Update Keras sentiment analysis example

Author: Matthew Honnibal
Date:   2017-11-05 17:11:00 +01:00
parent 2b35bb76ad
commit 717e8124fb


@@ -9,7 +9,9 @@ from keras.models import Sequential, model_from_json
 from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
 from keras.layers import TimeDistributed
 from keras.optimizers import Adam
-import cPickle as pickle
+from spacy.compat import pickle
+import thinc.extra.datasets
 
 import spacy
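
Note: cPickle only exists on Python 2. spacy.compat.pickle resolves to cPickle there and to the standard pickle module on Python 3, so the example now runs on both. A minimal sketch of the shim in use:

    from spacy.compat import pickle

    # Round-trip some data the way the example saves its weights; the
    # same import works unchanged on Python 2 and Python 3.
    data = {'weights': [0.1, 0.2, 0.3]}
    assert pickle.loads(pickle.dumps(data)) == data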
@@ -70,11 +72,14 @@ def get_features(docs, max_length):
     for i, doc in enumerate(docs):
         j = 0
         for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-                if j >= max_length:
-                    break
+            vector_id = token.vocab.vectors.find(key=token.orth)
+            if vector_id >= 0:
+                Xs[i, j] = vector_id
+            else:
+                Xs[i, j] = 0
+            j += 1
+            if j >= max_length:
+                break
     return Xs
 
 
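Note: lexeme ranks are no longer a reliable row index in spaCy v2, so the features now come from the vector table itself: Vectors.find(key=...) returns the row id for a key, or -1 when no vector is stored. A small sketch of that lookup, assuming the en_vectors_web_lg package is installed:

    import spacy

    nlp = spacy.load('en_vectors_web_lg')
    for token in nlp(u'apple blorptastic'):
        vector_id = token.vocab.vectors.find(key=token.orth)
        # Out-of-vocabulary tokens come back as -1 and get mapped to
        # row 0, which mask_zero=True tells Keras to ignore.
        print(token.text, max(vector_id, 0))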
@@ -82,12 +87,13 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
           lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
           by_sentence=True):
     print("Loading spaCy")
-    nlp = spacy.load('en', entity=False)
+    nlp = spacy.load('en_vectors_web_lg')
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
     embeddings = get_embeddings(nlp.vocab)
     model = compile_lstm(embeddings, lstm_shape, lstm_settings)
     print("Parsing texts...")
-    train_docs = list(nlp.pipe(train_texts, batch_size=5000, n_threads=3))
-    dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
+    train_docs = list(nlp.pipe(train_texts))
+    dev_docs = list(nlp.pipe(dev_texts))
     if by_sentence:
         train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
         dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
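
Note: the en_vectors_web_lg package ships word vectors only, with no parser to set sentence boundaries, so get_labelled_sentences() needs the rule-based sentencizer added to the pipeline. A sketch, again assuming the vectors package is installed:

    import spacy

    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(u'The plot was thin. The soundtrack carried it.')
    # Without the sentencizer, iterating doc.sents would complain that
    # sentence boundaries are not set.
    print([sent.text for sent in doc.sents])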
@@ -111,9 +117,10 @@ def compile_lstm(embeddings, shape, settings):
             mask_zero=True
         )
     )
-    model.add(TimeDistributed(Dense(shape['nr_hidden'], bias=False)))
-    model.add(Bidirectional(LSTM(shape['nr_hidden'], dropout_U=settings['dropout'],
-                                 dropout_W=settings['dropout'])))
+    model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False)))
+    model.add(Bidirectional(LSTM(shape['nr_hidden'],
+                                 recurrent_dropout=settings['dropout'],
+                                 dropout=settings['dropout'])))
     model.add(Dense(shape['nr_class'], activation='sigmoid'))
     model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy',
                   metrics=['accuracy'])
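
Note: these are the Keras 2 spellings of the old Keras 1 arguments: Dense(bias=...) became use_bias, and the LSTM pair dropout_W/dropout_U became dropout/recurrent_dropout. A standalone sketch of the same stack with hypothetical sizes (1000-row vocabulary, 64 hidden units):

    from keras.models import Sequential
    from keras.layers import (Embedding, TimeDistributed, Dense,
                              Bidirectional, LSTM)
    from keras.optimizers import Adam

    model = Sequential()
    model.add(Embedding(1000, 64, mask_zero=True))
    model.add(TimeDistributed(Dense(64, use_bias=False)))
    model.add(Bidirectional(LSTM(64, dropout=0.5, recurrent_dropout=0.5)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()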
@@ -121,12 +128,7 @@ def compile_lstm(embeddings, shape, settings):
 
 
 def get_embeddings(vocab):
-    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
-    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
-    for lex in vocab:
-        if lex.has_vector:
-            vectors[lex.rank + 1] = lex.vector
-    return vectors
+    return vocab.vectors.data
 
 
 def evaluate(model_dir, texts, labels, max_length=100):
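
Note: the whole copy loop is obsolete in spaCy v2, where vocab.vectors.data already exposes the vector table as a single float32 numpy array of shape (n_vectors, width), and the row ids produced by get_features() index straight into it. A quick check, assuming en_vectors_web_lg:

    import spacy

    nlp = spacy.load('en_vectors_web_lg')
    embeddings = nlp.vocab.vectors.data
    # One row per stored vector; the width is 300 for this package.
    print(embeddings.shape, embeddings.dtype)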
@@ -174,22 +176,32 @@ def read_data(data_dir, limit=0):
     batch_size=("Size of minibatches for training LSTM", "option", "b", int),
     nr_examples=("Limit to N examples", "option", "n", int)
 )
-def main(model_dir, train_dir, dev_dir,
+def main(model_dir=None, train_dir=None, dev_dir=None,
          is_runtime=False,
         nr_hidden=64, max_length=100, # Shape
         dropout=0.5, learn_rate=0.001, # General NN config
         nb_epoch=5, batch_size=100, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
+    if model_dir is not None:
+        model_dir = pathlib.Path(model_dir)
+    if train_dir is None or dev_dir is None:
+        imdb_data = thinc.extra.datasets.imdb()
     if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
+        if dev_dir is None:
+            dev_texts, dev_labels = zip(*imdb_data[1])
+        else:
+            dev_texts, dev_labels = read_data(dev_dir)
         acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
         print(acc)
     else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
+        if train_dir is None:
+            train_texts, train_labels = zip(*imdb_data[0])
+        else:
+            print("Read data")
+            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
+        if dev_dir is None:
+            dev_texts, dev_labels = zip(*imdb_data[1])
+        else:
+            dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
     train_labels = numpy.asarray(train_labels, dtype='int32')
     dev_labels = numpy.asarray(dev_labels, dtype='int32')
     lstm = train(train_texts, train_labels, dev_texts, dev_labels,
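
Note: all three path arguments are now optional because thinc bundles an IMDB loader. thinc.extra.datasets.imdb() fetches the corpus on first use and returns a (train, dev) pair of lists of (text, label) tuples, which zip(*...) splits into parallel sequences. Roughly:

    import thinc.extra.datasets

    # Downloads and caches the IMDB sentiment corpus on first call.
    imdb_data = thinc.extra.datasets.imdb()
    train_texts, train_labels = zip(*imdb_data[0])
    dev_texts, dev_labels = zip(*imdb_data[1])
    print(len(train_texts), 'train /', len(dev_texts), 'dev examples')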
@@ -198,10 +210,11 @@ def main(model_dir, train_dir, dev_dir,
                  {},
                  nb_epoch=nb_epoch, batch_size=batch_size)
     weights = lstm.get_weights()
-    with (model_dir / 'model').open('wb') as file_:
-        pickle.dump(weights[1:], file_)
-    with (model_dir / 'config.json').open('wb') as file_:
-        file_.write(lstm.to_json())
+    if model_dir is not None:
+        with (model_dir / 'model').open('wb') as file_:
+            pickle.dump(weights[1:], file_)
+        with (model_dir / 'config.json').open('wb') as file_:
+            file_.write(lstm.to_json())
 
 
 if __name__ == '__main__':
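
Note: saving is now skipped when no model_dir is given, matching the newly optional argument. The pickled list starts at weights[1:] because weights[0] is the frozen embedding matrix, which can be rebuilt from the spaCy vector table instead of being stored. A sketch of what the matching reload might look like, using a hypothetical path:

    import pathlib
    from keras.models import model_from_json
    from spacy.compat import pickle

    model_dir = pathlib.Path('/tmp/sentiment_model')  # hypothetical path
    with (model_dir / 'config.json').open() as file_:
        model = model_from_json(file_.read())
    with (model_dir / 'model').open('rb') as file_:
        lstm_weights = pickle.load(file_)  # everything but the embeddings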