From 9d95c2617905139eec63703f6ab84b5ef092be61 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 19 Oct 2015 23:44:49 +1100
Subject: [PATCH] * Add simple deep feed-forward neural network text classification example.

---
 examples/nn_text_class.py | 273 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 273 insertions(+)
 create mode 100644 examples/nn_text_class.py

diff --git a/examples/nn_text_class.py b/examples/nn_text_class.py
new file mode 100644
index 000000000..4a19e5780
--- /dev/null
+++ b/examples/nn_text_class.py
@@ -0,0 +1,273 @@
+from __future__ import unicode_literals
+from __future__ import print_function
+from __future__ import division
+
+from collections import defaultdict
+from pathlib import Path
+import numpy
+import plac
+
+import spacy.en
+
+
+def read_data(nlp, data_dir):
+    for subdir, label in (('pos', 1), ('neg', 0)):
+        for filename in (data_dir / subdir).iterdir():
+            text = filename.open().read()
+            doc = nlp(text)
+            if len(doc) >= 1:
+                yield doc, label
+
+
+def partition(examples, split_size):
+    examples = list(examples)
+    numpy.random.shuffle(examples)
+    n_docs = len(examples)
+    split = int(n_docs * split_size)
+    return examples[:split], examples[split:]
+
+
+def minibatch(data, bs=24):
+    for i in range(0, len(data), bs):
+        yield data[i:i+bs]
+
+
+class Extractor(object):
+    def __init__(self, nlp, vector_length, dropout=0.3):
+        self.nlp = nlp
+        self.dropout = dropout
+        self.vector = numpy.zeros((vector_length, ))
+
+    def doc2bow(self, doc, dropout=None):
+        if dropout is None:
+            dropout = self.dropout
+        bow = defaultdict(int)
+        all_words = defaultdict(int)
+        for word in doc:
+            if numpy.random.random() >= dropout and not word.is_punct:
+                bow[word.lower] += 1
+            all_words[word.lower] += 1
+        if sum(bow.values()) >= 1:
+            return bow
+        else:
+            return all_words
+
+    def bow2vec(self, bow, E):
+        self.vector.fill(0)
+        n = 0
+        for orth_id, freq in bow.items():
+            self.vector += self.nlp.vocab[self.nlp.vocab.strings[orth_id]].repvec * freq
+            # Apply the fine-tuning we've learned
+            if orth_id < E.shape[0]:
+                self.vector += E[orth_id] * freq
+            n += freq
+        return self.vector / n
+
+
+class NeuralNetwork(object):
+    def __init__(self, depth, width, n_classes, n_vocab, extracter, optimizer):
+        self.depth = depth
+        self.width = width
+        self.n_classes = n_classes
+        self.weights = Params.random(depth, width, width, n_classes, n_vocab)
+        self.doc2bow = extracter.doc2bow
+        self.bow2vec = extracter.bow2vec
+        self.optimizer = optimizer
+        self._gradient = Params.zero(depth, width, width, n_classes, n_vocab)
+        self._activity = numpy.zeros((depth, width))
+
+    def train(self, batch):
+        activity = self._activity
+        gradient = self._gradient
+        activity.fill(0)
+        gradient.data.fill(0)
+        loss = 0
+        word_freqs = defaultdict(int)
+        for doc, label in batch:
+            word_ids = self.doc2bow(doc)
+            vector = self.bow2vec(word_ids, self.weights.E)
+            self.forward(activity, vector)
+            loss += self.backprop(vector, gradient, activity, word_ids, label)
+            for w, freq in word_ids.items():
+                word_freqs[w] += freq
+        self.optimizer(self.weights, gradient, len(batch), word_freqs)
+        return loss
+
+    def predict(self, doc):
+        actv = self._activity
+        actv.fill(0)
+        W = self.weights.W
+        b = self.weights.b
+        E = self.weights.E
+
+        vector = self.bow2vec(self.doc2bow(doc, dropout=0.0), E)
+        self.forward(actv, vector)
+        return numpy.argmax(softmax(actv[-1], W[-1], b[-1]))
+
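+    # Shape note (comment added for clarity; the embedding and hidden sizes are
+    # both `width` here, so every hidden activation is a width-length vector):
+    #   actv[0] = relu(W[0] . vector + b[0])
+    #   actv[i] = relu(W[i] . actv[i-1] + b[i])    for i in 1..depth-1
+    #   scores  = softmax(W[-1] . actv[-1] + b[-1])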
+    def forward(self, actv, in_):
+        actv.fill(0)
+        W = self.weights.W; b = self.weights.b
+        actv[0] = relu(in_, W[0], b[0])
+        for i in range(1, self.depth):
+            actv[i] = relu(actv[i-1], W[i], b[i])
+
+    def backprop(self, input_vector, gradient, activity, ids, label):
+        W = self.weights.W
+        b = self.weights.b
+
+        target = numpy.zeros(self.n_classes)
+        target[label] = 1.0
+        pred = softmax(activity[-1], W[-1], b[-1])
+        delta = pred - target
+
+        for i in range(self.depth, 0, -1):
+            gradient.b[i] += delta
+            gradient.W[i] += numpy.outer(delta, activity[i-1])
+            delta = d_relu(activity[i-1]) * W[i].T.dot(delta)
+
+        gradient.b[0] += delta
+        gradient.W[0] += numpy.outer(delta, input_vector)
+        tuning = W[0].T.dot(delta).reshape((self.width,)) / len(ids)
+        for w, freq in ids.items():
+            if w < gradient.E.shape[0]:
+                gradient.E[w] += tuning * freq
+        return -sum(target * numpy.log(pred))
+
+
+def softmax(actvn, W, b):
+    w = W.dot(actvn) + b
+    ew = numpy.exp(w - max(w))
+    return (ew / sum(ew)).ravel()
+
+
+def relu(actvn, W, b):
+    x = W.dot(actvn) + b
+    return x * (x > 0)
+
+
+def d_relu(x):
+    return x > 0
+
+
+class Adagrad(object):
+    def __init__(self, lr, rho):
+        self.eps = 1e-3
+        # initial learning rate
+        self.learning_rate = lr
+        self.rho = rho
+        # stores sum of squared gradients
+        #self.h = numpy.zeros(self.dim)
+        #self._curr_rate = numpy.zeros(self.h.shape)
+        self.h = None
+        self._curr_rate = None
+
+    def __call__(self, weights, gradient, batch_size, word_freqs):
+        if self.h is None:
+            self.h = numpy.zeros(gradient.data.shape)
+            self._curr_rate = numpy.zeros(gradient.data.shape)
+        self.L2_penalty(gradient, weights, word_freqs)
+        update = self.rescale(gradient.data / batch_size)
+        weights.data -= update
+
+    def rescale(self, gradient):
+        if self.h is None:
+            self.h = numpy.zeros(gradient.data.shape)
+            self._curr_rate = numpy.zeros(gradient.data.shape)
+        self._curr_rate.fill(0)
+        self.h += gradient ** 2
+        self._curr_rate = self.learning_rate / (numpy.sqrt(self.h) + self.eps)
+        return self._curr_rate * gradient
+
+    def L2_penalty(self, gradient, weights, word_freqs):
+        # L2 Regularization
+        for i in range(len(weights.W)):
+            gradient.W[i] += weights.W[i] * self.rho
+            gradient.b[i] += weights.b[i] * self.rho
+        for w, freq in word_freqs.items():
+            if w < gradient.E.shape[0]:
+                gradient.E[w] += weights.E[w] * self.rho
+
+
+class Params(object):
+    @classmethod
+    def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab):
+        return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: numpy.zeros((x,)))
+
+    @classmethod
+    def random(cls, depth, nE, nH, nL, nV):
+        return cls(depth, nE, nH, nL, nV, lambda x: (numpy.random.rand(x) * 2 - 1) * 0.08)
+
+    def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer):
+        nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab
+        n_weights = sum([
+            (nE * nH) + nH,
+            (nH * nH + nH) * depth,
+            (nH * nL) + nL,
+            (nV * nE)
+        ])
+        self.data = initializer(n_weights)
+        self.W = []
+        self.b = []
+        i = self._add_layer(0, nE, nH)
+        for _ in range(1, depth):
+            i = self._add_layer(i, nH, nH)
+        i = self._add_layer(i, nL, nH)
+        self.E = self.data[i : i + (nV * nE)].reshape((nV, nE))
+        self.E.fill(0)
+
+    def _add_layer(self, start, x, y):
+        end = start + (x * y)
+        self.W.append(self.data[start : end].reshape((x, y)))
+        self.b.append(self.data[end : end + x].reshape((x, )))
+        return end + x
+
+
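+# Layout note (comment added for clarity; sizes refer to the defaults in main()
+# below): Params packs every parameter into one flat self.data array, so Adagrad
+# can rescale and apply the whole update in a single vectorised step. With
+# depth=3, width=300, 2 classes and a 40,000-word fine-tuning vocabulary:
+#   W[0]..W[2]: three 300x300 ReLU layers (each with a 300-d bias)
+#   W[3]:       a 2x300 softmax output layer (with a 2-d bias)
+#   E:          a 40000x300 fine-tuning table added on top of the spaCy vectors
+
+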
+@plac.annotations(
+    data_dir=("Data directory", "positional", None, Path),
+    n_iter=("Number of iterations (epochs)", "option", "i", int),
+    width=("Size of hidden layers", "option", "H", int),
+    depth=("Depth", "option", "d", int),
+    dropout=("Drop-out rate", "option", "r", float),
+    rho=("Regularization penalty", "option", "p", float),
+    eta=("Learning rate", "option", "e", float),
+    batch_size=("Batch size", "option", "b", int),
+    vocab_size=("Number of words to fine-tune", "option", "w", int),
+)
+def main(data_dir, depth=3, width=300, n_iter=5, vocab_size=40000,
+         batch_size=24, dropout=0.3, rho=1e-5, eta=0.005):
+    n_classes = 2
+    print("Loading")
+    nlp = spacy.en.English(parser=False)
+    train_data, dev_data = partition(read_data(nlp, data_dir / 'train'), 0.8)
+    print("Begin training")
+    extracter = Extractor(nlp, width, dropout=dropout)
+    optimizer = Adagrad(eta, rho)
+    model = NeuralNetwork(depth, width, n_classes, vocab_size, extracter, optimizer)
+    prev_best = 0
+    best_weights = None
+    for epoch in range(n_iter):
+        numpy.random.shuffle(train_data)
+        train_loss = 0.0
+        for batch in minibatch(train_data, bs=batch_size):
+            train_loss += model.train(batch)
+        n_correct = sum(model.predict(x) == y for x, y in dev_data)
+        print(epoch, train_loss, n_correct / len(dev_data))
+        if n_correct >= prev_best:
+            best_weights = model.weights.data.copy()
+            prev_best = n_correct
+
+    model.weights.data = best_weights
+    print("Evaluating")
+    eval_data = list(read_data(nlp, data_dir / 'test'))
+    n_correct = sum(model.predict(x) == y for x, y in eval_data)
+    print(n_correct / len(eval_data))
+
+
+if __name__ == '__main__':
+    #import cProfile
+    #import pstats
+    #cProfile.runctx("main(Path('data/aclImdb'))", globals(), locals(), "Profile.prof")
+    #s = pstats.Stats("Profile.prof")
+    #s.strip_dirs().sort_stats("time").print_stats(100)
+    plac.call(main)
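+
+# Example invocation (illustrative comment, assuming the aclImdb-style layout
+# the script expects: train/pos, train/neg, test/pos and test/neg directories
+# of plain-text reviews under the data directory):
+#   python examples/nn_text_class.py data/aclImdb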