From 31b5e58aebe34e7bc2ac1b615bb99324e0304637 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 30 Jun 2015 14:26:53 +0200
Subject: [PATCH] * Begin reorganizing neuralnet work

---
 bin/parser/nn_train.py | 171 ++++++++++++-----------------------
 1 file changed, 49 insertions(+), 122 deletions(-)

diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py
index 2fc1958ed..72c9e04f1 100755
--- a/bin/parser/nn_train.py
+++ b/bin/parser/nn_train.py
@@ -23,7 +23,7 @@
 from spacy.gold import GoldParse
 from spacy.scorer import Scorer
 
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
 from spacy._theano import TheanoModel
 
 import theano
@@ -40,76 +40,37 @@
 theano.config.floatX = 'float32'
 floatX = theano.config.floatX
 
 
-def th_share(w, name=''):
-    return theano.shared(value=w, borrow=True, name=name)
-
-
-class Param(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        return [(self.curr, curr), (self.step, step)]
-
-
-class AdadeltaParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        # accu: accumulate gradient magnitudes
-        self.accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-        # delta_accu: accumulate update magnitudes (recursively!)
-        self.delta_accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-
-    def updates(self, cost, timestep, eps, rho):
-        # update accu (as in rmsprop)
-        grad = T.grad(cost, self.curr)
-        accu_new = rho * self.accu + (1 - rho) * grad ** 2
-
-        # compute parameter update, using the 'old' delta_accu
-        update = (grad * T.sqrt(self.delta_accu + eps) /
-                  T.sqrt(accu_new + eps))
-        # update delta_accu (as accu, but accumulating updates)
-        delta_accu_new = rho * self.delta_accu + (1 - rho) * update ** 2
-        return [(self.curr, self.curr - update), (self.accu, accu_new),
-                (self.delta_accu, delta_accu_new)]
-
-
-class AvgParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.avg = self.curr
-        self.avg = wrapper(numpy_data.copy(), name=name+'_avg')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX)
-        avg = ((1 - alpha) * self.avg) + (alpha * curr)
-        return [(self.curr, curr), (self.step, step), (self.avg, avg)]
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
+def L1(L1_reg, *weights):
+    return L1_reg * sum(abs(w).sum() for w in weights)
 
 
 def L2(L2_reg, *weights):
     return L2_reg * sum((w ** 2).sum() for w in weights)
 
 
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
+def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
+    updates = OrderedDict()
+    for param in params:
+        value = param.get_value(borrow=True)
+        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+
+        grad = T.grad(loss, param)
+        accu_new = rho * accu + (1 - rho) * grad ** 2
+        updates[accu] = accu_new
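+        # Scale each parameter's step by the root of its own running average
+        # of squared gradients, so dimensions with consistently large
+        # gradients take proportionally smaller steps.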
+        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
+    return updates
 
 
 def relu(x):
     return x * (x > 0)
 
 
-def _init_weights(n_in, n_out):
+def feed_layer(activation, weights, bias, input_):
+    return activation(T.dot(input_, weights) + bias)
+
+
+def init_weights(n_in, n_out):
     rng = numpy.random.RandomState(1235)
     weights = numpy.asarray(
@@ -117,57 +78,35 @@
         dtype=theano.config.floatX
     )
     bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
-    return [AvgParam(weights, name='W'), AvgParam(bias, name='b')]
+    return [theano.shared(weights, name='W', borrow=True),
+            theano.shared(bias, name='b', borrow=True)]
 
 
-def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg):
-    costs = T.ivector('costs')
-    is_gold = T.ivector('is_gold')
+def compile_model(n_classes, n_hidden, n_in, optimizer):
     x = T.vector('x')
-    y = T.scalar('y')
-    y_cost = T.scalar('y_cost')
-    loss = T.scalar('cost')
-    timestep = theano.shared(1)
-    eta = T.scalar('eta').astype(floatX)
-    mu = T.scalar('mu').astype(floatX)
+    costs = T.ivector('costs')
 
-    maxent_W, maxent_b = _init_weights(n_hidden, n_classes)
-    hidden_W, hidden_b = _init_weights(n_in, n_hidden)
+    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
+    hidden_W, hidden_b = init_weights(n_in, n_hidden)
 
     # Feed the inputs forward through the network
     p_y_given_x = feed_layer(
         T.nnet.softmax,
-        maxent_W.curr,
-        maxent_b.curr,
+        maxent_W,
+        maxent_b,
         feed_layer(
             relu,
-            hidden_W.curr,
-            hidden_b.curr,
+            hidden_W,
+            hidden_b,
             x))
 
-    stabilizer = 1e-8
-    y_cost = costs[T.argmax(p_y_given_x[0])]
-
-    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + stabilizer)
-
-    debug = theano.function(
-        name='debug',
-        inputs=[x, costs],
-        outputs=[p_y_given_x, T.eq(costs, 0), p_y_given_x[0] * T.eq(costs, 0)],
-    )
+    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
 
     train_model = theano.function(
         name='train_model',
-        inputs=[x, costs, eta, mu],
-        outputs=[p_y_given_x[0], T.grad(loss, x), T.argmax(p_y_given_x, axis=1),
-                 loss],
-        updates=(
-            [(timestep, timestep + 1)] +
-            maxent_W.updates(loss, timestep, eta, mu) +
-            maxent_b.updates(loss, timestep, eta, mu) +
-            hidden_W.updates(loss, timestep, eta, mu) +
-            hidden_b.updates(loss, timestep, eta, mu)
-        ),
+        inputs=[x, costs],
+        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
+        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
         on_unused_input='warn'
     )
@@ -177,18 +116,18 @@
         outputs=[
             feed_layer(
                 T.nnet.softmax,
-                maxent_W.avg,
-                maxent_b.avg,
+                maxent_W,
+                maxent_b,
                 feed_layer(
                     relu,
-                    hidden_W.avg,
-                    hidden_b.avg,
+                    hidden_W,
+                    hidden_b,
                     x
                 )
             )[0]
         ]
     )
-    return debug, train_model, evaluate_model
+    return train_model, evaluate_model
 
 
 def score_model(scorer, nlp, annot_tuples, verbose=False):
@@ -202,21 +141,6 @@
 def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
           eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
           seed=0, n_sents=0, verbose=False):
-    def make_model(n_classes, (words, tags, labels), model_dir):
-        n_in = (nv_word * len(words)) + \
-               (nv_tag * len(tags)) + \
-               (nv_label * len(labels))
-        debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden,
-                                                               n_in, 0.0, 0.0)
-        return TheanoModel(
-            n_classes,
-            ((nv_word, words), (nv_tag, tags), (nv_label, labels)),
-            train_func,
-            predict_func,
-            model_loc=model_dir,
-            eta=eta, mu=mu,
-            debug=debug)
-
     dep_model_dir = path.join(model_dir, 'deps')
     pos_model_dir = path.join(model_dir, 'pos')
@@ -230,21 +154,24 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
     Config.write(dep_model_dir, 'config',
                  seed=seed,
-                 features=feat_set,
+                 templates=tuple(),
                  labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                  vector_lengths=(nv_word, nv_tag, nv_label),
                  hidden_nodes=nv_hidden,
                  eta=eta,
                  mu=mu
                  )
-    
+
+    # Bake-in hyper-parameters; rho and eps mirror the rms_prop defaults
+    rho = 0.9
+    eps = 1e-6
+    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
+    nlp = Language(data_dir=model_dir)
+    n_classes = nlp.parser.model.n_classes
+    # TODO: n_in, input_spec and model_loc are not yet built in the reorganized
+    # train(); the removed make_model shows how they were previously derived.
+    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
+    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
+                                   predict, model_loc)
+
     if n_sents > 0:
         gold_tuples = gold_tuples[:n_sents]
-
-    nlp = Language(data_dir=model_dir)
-    nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem,
-                         make_model)
-
     print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
     log_loc = path.join(model_dir, 'job.log')
     for itn in range(n_iter):
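Note on the new optimizer contract: compile_model now accepts any callable
mapping (loss, params) to a Theano updates dictionary, with the
hyper-parameters baked in by the caller. Below is a minimal, self-contained
sketch of that contract on a toy one-layer softmax model. It restates the
patch's rms_prop so it runs stand-alone; the toy sizes, the example data and
the names train_model, eg_x and eg_costs are illustrative assumptions, not
part of the patch.

from collections import OrderedDict

import numpy
import theano
import theano.tensor as T

theano.config.floatX = 'float32'


def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    # Same scheme as the patch: keep a running average of each parameter's
    # squared gradients, and scale its step by the root of that average.
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates


# Toy single-layer softmax "parser" with 4 inputs and 3 moves, enough to
# exercise the contract end-to-end.
n_in, n_classes = 4, 3
W = theano.shared(numpy.zeros((n_in, n_classes), dtype='float32'), name='W')
b = theano.shared(numpy.zeros((n_classes,), dtype='float32'), name='b')

x = T.vector('x')
costs = T.ivector('costs')  # cost 0 marks a gold-standard move, as in the patch
p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)  # 1 x n_classes row, hence [0]
loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

# Bake the hyper-parameters in, exactly as train() does with its lambda
optimizer = lambda loss_, params: rms_prop(loss_, params, eta=0.01)

train_model = theano.function(
    inputs=[x, costs],
    outputs=loss,
    updates=optimizer(loss, [W, b]))

eg_x = numpy.ones(n_in, dtype='float32')
eg_costs = numpy.array([0, 1, 1], dtype='int32')
print train_model(eg_x, eg_costs)  # loss shrinks on repeated calls

Because the update rule is passed in rather than hard-coded, swapping RMSProp
for AdaDelta or plain momentum only means handing a different
(loss, params) -> updates callable to the same compilation code, which is the
direction this reorganization is taking.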