"""
This example shows how to use an LSTM sentiment classification model trained
using Keras in spaCy. spaCy splits the document into sentences, and each
sentence is classified using the LSTM. The scores for the sentences are then
aggregated to give the document score. This kind of hierarchical model is
quite difficult in "pure" Keras or TensorFlow, but it's very effective. The
Keras example on this dataset performs quite poorly, because it cuts off the
documents so that they're a fixed size. This hurts review accuracy a lot,
because people often summarise their rating in the final sentence.

Prerequisites:
spacy download en_vectors_web_lg
pip install keras==2.0.9

Compatible with: spaCy v2.0.0+
"""
import plac
import random
import pathlib

import cytoolz
import numpy
from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
import thinc.extra.datasets
from spacy.compat import pickle
import spacy

class SentimentAnalyser(object):
    @classmethod
    def load(cls, path, nlp, max_length=100):
        with (path / 'config.json').open() as file_:
            model = model_from_json(file_.read())
        with (path / 'model').open('rb') as file_:
            lstm_weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + lstm_weights)
        return cls(model, max_length=max_length)

    def __init__(self, model, max_length=100):
        self._model = model
        self.max_length = max_length

    def __call__(self, doc):
        X = get_features([doc], self.max_length)
        y = self._model.predict(X)
        self.set_sentiment(doc, y)
        # spaCy v2 pipeline components must return the processed Doc.
        return doc
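
    # pipe() classifies each sentence and nudges the owning document's score
    # up or down relative to the 0.5 midpoint of the sigmoid output. For
    # example (illustrative numbers), sentences scored 0.9 and 0.2 contribute
    # +0.4 and -0.3, leaving doc.sentiment 0.1 above its starting value.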
    def pipe(self, docs, batch_size=1000, n_threads=2):
        for minibatch in cytoolz.partition_all(batch_size, docs):
            minibatch = list(minibatch)
            sentences = []
            for doc in minibatch:
                sentences.extend(doc.sents)
            Xs = get_features(sentences, self.max_length)
            ys = self._model.predict(Xs)
            for sent, label in zip(sentences, ys):
                sent.doc.sentiment += label - 0.5
            for doc in minibatch:
                yield doc

    def set_sentiment(self, doc, y):
        doc.sentiment = float(y[0])
        # Sentiment has a native slot for a single float.
        # For arbitrary data storage, there's:
        # doc.user_data['my_data'] = y

def get_labelled_sentences(docs, doc_labels):
    labels = []
    sentences = []
    for doc, y in zip(docs, doc_labels):
        for sent in doc.sents:
            sentences.append(sent)
            labels.append(y)
    return sentences, numpy.asarray(labels, dtype='int32')

def get_features(docs, max_length):
    # Each row holds the vector-table row IDs for one doc's tokens,
    # zero-padded (and truncated) to max_length.
    docs = list(docs)
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            vector_id = token.vocab.vectors.find(key=token.orth)
            if vector_id >= 0:
                Xs[i, j] = vector_id
            else:
                Xs[i, j] = 0
            j += 1
            if j >= max_length:
                break
    return Xs
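
# Illustrative only: with max_length=5, a three-token sentence whose tokens
# sit at rows 1523, 87 and 410 of the vectors table becomes the feature row
# [1523, 87, 410, 0, 0]. The IDs here are hypothetical.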


def train(train_texts, train_labels, dev_texts, dev_labels,
          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
          nb_epoch=5, by_sentence=True):
    print("Loading spaCy")
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    embeddings = get_embeddings(nlp.vocab)
    model = compile_lstm(embeddings, lstm_shape, lstm_settings)
    print("Parsing texts...")
    train_docs = list(nlp.pipe(train_texts))
    dev_docs = list(nlp.pipe(dev_texts))
    if by_sentence:
        train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
        dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)

    train_X = get_features(train_docs, lstm_shape['max_length'])
    dev_X = get_features(dev_docs, lstm_shape['max_length'])
    # Keras 2 renamed fit()'s nb_epoch argument to epochs.
    model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
              epochs=nb_epoch, batch_size=batch_size)
    return model
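

# A sketch of the network compile_lstm() assembles: a frozen pretrained
# Embedding feeds a TimeDistributed Dense projection (down to nr_hidden
# dimensions per token), then a Bidirectional LSTM, then a single sigmoid
# unit scoring the whole sequence.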
def compile_lstm(embeddings, shape, settings):
    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],
            embeddings.shape[1],
            input_length=shape['max_length'],
            trainable=False,
            weights=[embeddings],
            mask_zero=True
        )
    )
    model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False)))
    model.add(Bidirectional(LSTM(shape['nr_hidden'],
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout'])))
    model.add(Dense(shape['nr_class'], activation='sigmoid'))
    model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def get_embeddings(vocab):
    # The vectors table is a single (n_vectors, width) numpy array, which
    # slots straight into the Embedding layer as its weight matrix.
    return vocab.vectors.data

def evaluate(model_dir, texts, labels, max_length=100):
    # Build a spaCy v2 pipeline with the same vectors the model was trained
    # with; the sentencizer provides the sentence boundaries that
    # SentimentAnalyser.pipe() relies on. (The old spaCy v1-style
    # nlp.pipeline assignment no longer works in v2.)
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))

    correct = 0
    i = 0
    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
        i += 1
    return float(correct) / i
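

# Assumed data layout (matching the IMDB review corpus): one plain-text
# review per file, under data_dir/pos/ and data_dir/neg/.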
def read_data(data_dir, limit=0):
    examples = []
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (data_dir / subdir).iterdir():
            with filename.open() as file_:
                text = file_.read()
            examples.append((text, label))
    random.shuffle(examples)
    if limit >= 1:
        examples = examples[:limit]
    return zip(*examples)  # Unzips into two lists

@plac.annotations(
    train_dir=("Location of training file or directory"),
    dev_dir=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
    nr_hidden=("Number of hidden units", "option", "H", int),
    max_length=("Maximum sentence length", "option", "L", int),
    dropout=("Dropout", "option", "d", float),
    learn_rate=("Learn rate", "option", "e", float),
    nb_epoch=("Number of training epochs", "option", "i", int),
    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
    nr_examples=("Limit to N examples", "option", "n", int)
)
def main(model_dir=None, train_dir=None, dev_dir=None,
         is_runtime=False,
         nr_hidden=64, max_length=100,  # Shape
         dropout=0.5, learn_rate=0.001,  # General NN config
         nb_epoch=5, batch_size=100, nr_examples=-1):  # Training params
    # read_data() needs pathlib.Path objects, but plac passes in strings.
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is not None:
        train_dir = pathlib.Path(train_dir)
    if dev_dir is not None:
        dev_dir = pathlib.Path(dev_dir)
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype='int32')
        dev_labels = numpy.asarray(dev_labels, dtype='int32')
        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
                     {'nr_hidden': nr_hidden, 'max_length': max_length,
                      'nr_class': 1},
                     {'dropout': dropout, 'lr': learn_rate},
                     {},
                     nb_epoch=nb_epoch, batch_size=batch_size)
        weights = lstm.get_weights()
        if model_dir is not None:
            # weights[0] is the frozen embedding matrix; it isn't saved,
            # since SentimentAnalyser.load() restores it from the spaCy
            # vocab via get_embeddings().
            with (model_dir / 'model').open('wb') as file_:
                pickle.dump(weights[1:], file_)
            # to_json() returns a str, so open the config in text mode.
            with (model_dir / 'config.json').open('w') as file_:
                file_.write(lstm.to_json())


if __name__ == '__main__':
    plac.call(main)