Fix entailment example, and add a flag for BiRNN encoding.

Matthew Honnibal 2016-11-12 11:43:37 -06:00
parent 2ee66117ba
commit 89df91846c
3 changed files with 70 additions and 30 deletions
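
The two new plac annotations surface as command-line flags: -T (tree_truncate) switches get_word_ids to tree-distance truncation, and -E (gru_encode) makes build_model wrap the embedded sentences in the bidirectional GRU encoder; both default to off. A hypothetical invocation (the entry-point script name and data paths are not part of this diff and are only placeholders):

python __main__.py train /tmp/nli_model snli_train.jsonl snli_dev.jsonl -T -E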

View File

@@ -3,25 +3,46 @@ import spacy
 import plac
 from pathlib import Path
+import ujson as json
+import numpy
+from keras.utils.np_utils import to_categorical
 
 from spacy_hook import get_embeddings, get_word_ids
 from spacy_hook import create_similarity_pipeline
 from keras_decomposable_attention import build_model
 
 
 def train(model_dir, train_loc, dev_loc, shape, settings):
+    train_texts1, train_texts2, train_labels = read_snli(train_loc)
+    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
     print("Loading spaCy")
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
+    nlp = spacy.load('en')
     print("Compiling network")
     model = build_model(get_embeddings(nlp.vocab), shape, settings)
     print("Processing texts...")
-    train_X = get_features(list(nlp.pipe(train_texts)))
-    dev_X = get_features(list(nlp.pipe(dev_texts)))
+    train_X1 = get_word_ids(list(nlp.pipe(train_texts1, n_threads=10, batch_size=10000)),
+                            max_length=shape[0],
+                            tree_truncate=settings['tree_truncate'])
+    train_X2 = get_word_ids(list(nlp.pipe(train_texts2, n_threads=10, batch_size=10000)),
+                            max_length=shape[0],
+                            tree_truncate=settings['tree_truncate'])
+    dev_X1 = get_word_ids(list(nlp.pipe(dev_texts1, n_threads=10, batch_size=10000)),
+                          max_length=shape[0],
+                          tree_truncate=settings['tree_truncate'])
+    dev_X2 = get_word_ids(list(nlp.pipe(dev_texts2, n_threads=10, batch_size=10000)),
+                          max_length=shape[0],
+                          tree_truncate=settings['tree_truncate'])
+    print(train_X1.shape, train_X2.shape)
+    print(dev_X1.shape, dev_X2.shape)
+    print(train_labels.shape, dev_labels.shape)
+    print(settings)
     model.fit(
-        train_X,
+        [train_X1, train_X2],
         train_labels,
-        validation_data=(dev_X, dev_labels),
+        validation_data=([dev_X1, dev_X2], dev_labels),
         nb_epoch=settings['nr_epoch'],
         batch_size=settings['batch_size'])
@@ -56,16 +77,20 @@ def demo(model_dir):
 LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
 
 
-def read_snli(loc):
-    with open(loc) as file_:
+def read_snli(path):
+    texts1 = []
+    texts2 = []
+    labels = []
+    with path.open() as file_:
         for line in file_:
             eg = json.loads(line)
             label = eg['gold_label']
             if label == '-':
                 continue
-            text1 = eg['sentence1']
-            text2 = eg['sentence2']
-            yield text1, text2, LABELS[label]
+            texts1.append(eg['sentence1'])
+            texts2.append(eg['sentence2'])
+            labels.append(LABELS[label])
+    return texts1, texts2, to_categorical(numpy.asarray(labels, dtype='int32'))
 
 
 @plac.annotations(
@@ -78,9 +103,13 @@ def read_snli(loc):
     dropout=("Dropout level", "option", "d", float),
     learn_rate=("Learning rate", "option", "e", float),
     batch_size=("Batch size for neural network training", "option", "b", float),
-    nr_epoch=("Number of training epochs", "option", "i", float)
+    nr_epoch=("Number of training epochs", "option", "i", float),
+    tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
+    gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
 )
 def main(mode, model_dir, train_loc, dev_loc,
+        tree_truncate=False,
+        gru_encode=False,
         max_length=100,
         nr_hidden=100,
         dropout=0.2,
@@ -92,7 +121,9 @@ def main(mode, model_dir, train_loc, dev_loc,
         'lr': learn_rate,
         'dropout': dropout,
         'batch_size': batch_size,
-        'nr_epoch': nr_epoch
+        'nr_epoch': nr_epoch,
+        'tree_truncate': tree_truncate,
+        'gru_encode': gru_encode
     }
     if mode == 'train':
         train(model_dir, train_loc, dev_loc, shape, settings)
@@ -101,6 +132,5 @@ def main(mode, model_dir, train_loc, dev_loc,
     else:
         demo(model_dir)
 
 
 if __name__ == '__main__':
     plac.call(main)
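
read_snli now reads the whole corpus into parallel lists and one-hot encodes the labels, so model.fit receives plain arrays rather than a generator. A minimal sketch of the label handling, assuming Keras 1.x as imported above (the label sequence is made up):

import numpy
from keras.utils.np_utils import to_categorical

LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
labels = [LABELS['entailment'], LABELS['neutral'], LABELS['contradiction']]
# One row per example, one column per SNLI class.
train_labels = to_categorical(numpy.asarray(labels, dtype='int32'))
print(train_labels.shape)   # (3, 3)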

View File

@@ -4,6 +4,7 @@ import numpy
 from keras.layers import InputSpec, Layer, Input, Dense, merge
 from keras.layers import Activation, Dropout, Embedding, TimeDistributed
+from keras.layers import Bidirectional, GRU
 import keras.backend as K
 import theano.tensor as T
 from keras.models import Sequential, Model, model_from_json
@@ -12,7 +13,7 @@ from keras.optimizers import Adam
 from keras.layers.normalization import BatchNormalization
 
 
-def build_model(vectors, shape, settings, use_rnn_encoding=False):
+def build_model(vectors, shape, settings):
     '''Compile the model.'''
     max_length, nr_hidden, nr_class = shape
     # Declare inputs.
@@ -21,8 +22,8 @@ def build_model(vectors, shape, settings, use_rnn_encoding=False):
     # Construct operations, which we'll chain together.
     embed = _StaticEmbedding(vectors, max_length, nr_hidden)
-    if use_rnn_encoding:
-        encode = _BiLSTMEncode(max_length, nr_hidden)
+    if settings['gru_encode']:
+        encode = _BiRNNEncoding(max_length, nr_hidden)
     attend = _Attention(max_length, nr_hidden)
     align = _SoftAlignment(max_length, nr_hidden)
     compare = _Comparison(max_length, nr_hidden)
@@ -32,7 +33,7 @@ def build_model(vectors, shape, settings, use_rnn_encoding=False):
     sent1 = embed(ids1) # Shape: (i, n)
     sent2 = embed(ids2) # Shape: (j, n)
-    if use_rnn_encoding:
+    if settings['gru_encode']:
         sent1 = encode(sent1)
         sent2 = encode(sent2)
@@ -78,15 +79,18 @@ class _StaticEmbedding(object):
     def __call__(self, sentence):
         return self.project(self.embed(sentence))
 
 
 class _BiRNNEncoding(object):
     def __init__(self, max_length, nr_out):
         self.model = Sequential()
-        self.model.add(Bidirectional(LSTM(nr_out, input_length=max_length)))
+        self.model.add(Bidirectional(GRU(int(nr_out/2), return_sequences=True),
+                                     input_shape=(max_length, nr_out)))
 
     def __call__(self, sentence):
         return self.model(sentence)
 
 
 class _Attention(object):
     def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
         self.max_length = max_length
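
The replacement encoder halves the GRU width per direction and returns full sequences, so its output keeps the (max_length, nr_hidden) shape the attention and alignment steps already expect, which is what lets gru_encode be toggled without touching the rest of the network. A rough shape check, assuming Keras 1.x as imported above:

from keras.models import Sequential
from keras.layers import Bidirectional, GRU

max_length, nr_hidden = 100, 200
encoder = Sequential()
# Each direction emits nr_hidden/2 units per timestep; concatenation restores nr_hidden.
encoder.add(Bidirectional(GRU(int(nr_hidden / 2), return_sequences=True),
                          input_shape=(max_length, nr_hidden)))
print(encoder.output_shape)   # expected: (None, 100, 200)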

View File

@@ -1,4 +1,5 @@
 from keras.models import model_from_json
+import numpy
 
 
 class KerasSimilarityShim(object):
@@ -30,7 +31,7 @@ class KerasSimilarityShim(object):
         return scores[0]
 
 
-def get_embeddings(cls, vocab):
+def get_embeddings(vocab):
     max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
     vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
     for lex in vocab:
@@ -39,16 +40,24 @@ def get_embeddings(cls, vocab):
     return vectors
 
 
-def get_word_ids(docs, max_length=100):
+def get_word_ids(docs, tree_truncate=False, max_length=100):
     Xs = numpy.zeros((len(docs), max_length), dtype='int32')
     for i, doc in enumerate(docs):
         j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-                if j >= max_length:
-                    break
+        queue = [sent.root for sent in doc.sents]
+        words = []
+        while len(words) <= max_length and queue:
+            word = queue.pop(0)
+            if word.has_vector and not word.is_punct and not word.is_space:
+                words.append(word)
+            queue.extend(list(word.lefts))
+            queue.extend(list(word.rights))
+        words.sort()
+        for j, token in enumerate(words):
+            Xs[i, j] = token.rank + 1
+            j += 1
+            if j >= max_length:
+                break
     return Xs
@@ -57,6 +66,3 @@ def create_similarity_pipeline(nlp):
         nlp.path / 'similarity',
         nlp,
         feature_extracter=get_features)]
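
With tree_truncate enabled, get_word_ids walks each parse tree breadth-first from the sentence roots, so documents longer than max_length lose the words farthest from the roots rather than whatever happens to fall at the end. A hedged usage sketch, assuming the spaCy 1.x 'en' model with the parser enabled (the sentences are illustrative):

import spacy
from spacy_hook import get_word_ids

nlp = spacy.load('en')   # parser needed for doc.sents, word.lefts, word.rights
docs = list(nlp.pipe([u'The cat sat on the mat.',
                      u'A second, rather longer sentence about the same cat.']))
ids = get_word_ids(docs, tree_truncate=True, max_length=10)
print(ids.shape)   # (2, 10): one zero-padded row of word ids per doc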