diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py new file mode 100644 index 000000000..ede640f71 --- /dev/null +++ b/examples/deep_learning_keras.py @@ -0,0 +1,164 @@ +import numpy +from collections import defaultdict + +import spacy + + +class SentimentAnalyser(object): + @classmethod + def load(cls, path, nlp): + pass + + def __init__(self, model): + self._model = model + + def __call__(self, doc): + X = get_features([doc], self.max_length) + y = self._keras_model.predict(X) + self.set_sentiment(doc, y) + + def pipe(self, docs, batch_size=1000, n_threads=2): + for minibatch in partition_all(batch_size, docs): + Xs = _get_features(minibatch) + ys = self._model.predict(X) + for i, doc in enumerate(minibatch): + doc.user_data['sentiment'] = ys[i] + + def set_sentiment(self, doc, y): + doc.user_data['sentiment'] = y + + +def compile_lstm(embeddings, shape, settings, optimizer): + model = Sequential() + model.add( + Embedding( + embeddings.shape[1], + embeddings.shape[0], + input_length=shape['max_length'], + trainable=False, + weights=[embeddings] + ) + ) + model.add(Bidirectional(LSTM(shape['nr_hidden']))) + model.add(Dropout(settings['dropout'])) + model.add(Dense(shape['nr_class'], activation='sigmoid')) + return model + + +def get_embeddings(vocab): + ''' + Get a numpy vector of the word embeddings. The Lexeme.rank attribute will + be the index into the table. We're going to be "decadent" here and use + 1m vectors, because we're not going to fine-tune them. + ''' + max_rank = max(lex.rank for lex in nlp.vocab if lex.has_vector) + vectors = numpy.ndarray((max_rank+1, nlp.vocab.vectors_length), dtype='float32') + for lex in vocab: + if lex.has_vector: + vectors[lex.rank] = lex.vector + return vectors + + +def get_features(docs, max_length): + Xs = numpy.zeros(len(docs), max_length, dtype='int32') + for i, doc in enumerate(minibatch): + for j, token in enumerate(doc[:max_length]): + Xs[i, j] = token.rank if token.has_vector else 0 + return Xs + + +def train(train_texts, train_labels, dev_texts, dev_labels, + lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5): + nlp = spacy.load('en', parser=False, tagger=False, entity=False) + model = _compile_model( + _get_embeddings( + nlp.vocab), + lstm_shape, + lstm_settings, + lstm_optimizer) + model.fit( + _get_features( + nlp.pipe( + train_texts)), + train_ys, + _get_features( + nlp.pipe( + dev_texts)), + dev_ys, + nb_epoch=nb_epoch, + batch_size=batch_size) + model.save(model_dir) + + +def demonstrate_runtime(model_dir, texts): + '''Demonstrate runtime usage of the custom sentiment model with spaCy. + + Here we return a dictionary mapping entities to the average sentiment of the + documents they occurred in. + ''' + def create_pipeline(nlp): + ''' + This could be a lambda, but named functions are easier to read in Python. + ''' + return [nlp.tagger, nlp.entity, SentimentAnalyser.load(model_dir, nlp)] + + nlp = spacy.load('en', create_pipeline=create_pipeline) + entity_sentiments = defaultdict(float) + entity_freqs = defaultdict(int) + for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): + sentiment = doc.user_data['sentiment'] + for ent in doc.ents: + entity_sentiments[ent.text] += sentiment + entity_freqs[ent.text] += 1 + # Compute estimate of P(sentiment | entity) + for entity, sentiment in entity_freqs.items(): + entity_sentiments[entity] /= entity_freqs[entity] + return entity_sentiments + + +def read_data(data_dir, limit=0): + examples = [] + for subdir, label in (('pos', 1), ('neg', 0)): + for filename in (data_dir / subdir).iterdir(): + with filename.open() as file_: + text = filename.read() + examples.append((text, label)) + random.shuffle(examples) + if limit >= 1: + examples = examples[:limit] + return zip(*examples) # Unzips into two lists + + +@plac.annotations( + language=("The language to train", "positional", None, str, ['en','de', 'zh']), + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + is_runtime=("Demonstrate run-time usage", "flag", "r", bool), + nr_hidden=("Number of hidden units", "flag", "H", int), + max_length=("Maximum sentence length", "flag", "L", int), + dropout=("Dropout", "flag", "d", float), + nr_epoch=("Number of training epochs", "flag", "i", int), + batch_size=("Size of minibatches for training LSTM", "flag", "b", int), + nr_examples=("Limit to N examples", "flag", "n", int) +) +def main(model_dir, train_dir, dev_dir, + is_runtime=False, + nr_hidden=64, max_length=100, # Shape + dropout=0.5, # General NN config + nb_epoch=5, batch_size=100, nr_examples=-1): # Training params + if is_runtime: + dev_texts, dev_labels = read_dev(dev_dir) + demonstrate_runtime(model_dir, dev_texts) + else: + train_texts, train_labels = read_data(train_dir, limit=nr_examples) + dev_texts, dev_labels = read_dev(dev_dir) + lstm = train(train_texts, train_labels, dev_texts, dev_labels, + {'nr_hidden': nr_hidden, 'max_length': max_length}, + {'dropout': 0.5}, + {}, + nb_epoch=nb_epoch, batch_size=batch_size) + + +if __name__ == '__main__': + plac.call(main)