From 6e19044d440513b40bdde40616b9acd8d1d9f4dd Mon Sep 17 00:00:00 2001 From: esmason Date: Fri, 1 Jul 2016 14:11:31 -0700 Subject: [PATCH] testing different update rules added momentum, nesterov momentum and RMSprop --- code/mlp.py | 291 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 272 insertions(+), 19 deletions(-) diff --git a/code/mlp.py b/code/mlp.py index e865bc8f..08f5a50a 100644 --- a/code/mlp.py +++ b/code/mlp.py @@ -23,18 +23,153 @@ __docformat__ = 'restructedtext en' - +import numpy as np import os import sys import timeit - +import csv import numpy - import theano import theano.tensor as T +import pickle +from MnistReader import MnistReader +from theano.printing import pydotprint + +from data import load_data + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression -from logistic_sgd import LogisticRegression, load_data + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + # start-snippet-1 + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( + value=numpy.zeros( + (n_in, n_out), + dtype=theano.config.floatX + ), + name='W', + borrow=True + ) + # initialize the biases b as a vector of n_out 0s + self.b = theano.shared( + value=numpy.zeros( + (n_out,), + dtype=theano.config.floatX + ), + name='b', + borrow=True + ) + #initialize accumulators for RMSprop + accW = theano.shared(value = np.zeros_like(self.W.eval(), dtype = theano.config.floatX), + name = "accW", + borrow = True, + ) + accB = theano.shared(value = np.zeros_like(self.b.eval(), dtype = theano.config.floatX), + name = "accB", + borrow = True, + ) + + self.accs = [accW, accB] + + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyperplane for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of + # hyperplane-k + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) + + # symbolic description of how to compute prediction as class whose + # probability is maximal + self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 + + # parameters of the model + self.params = [self.W, self.b] + + # keep track of model input + self.input = input + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. 
math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # start-snippet-2 + # y.shape[0] is (symbolically) the number of rows in y, i.e., + # number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain + # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of + # Log-Probabilities (call it LP) with one row per example and + # one column per class LP[T.arange(y.shape[0]),y] is a vector + # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., + # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is + # the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) + # end-snippet-2 + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError( + 'y should have the same shape as self.y_pred', + ('y', y.type, 'y_pred', self.y_pred.type) + ) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() # start-snippet-1 @@ -98,10 +233,8 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, if b is None: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) - self.W = W self.b = b - lin_output = T.dot(input, self.W) + self.b self.output = ( lin_output if activation is None @@ -109,7 +242,14 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, ) # parameters of the model self.params = [self.W, self.b] + accW = theano.shared(value = np.zeros_like(self.W.eval(), dtype = theano.config.floatX), + name = "accW", + borrow = True) + accB = theano.shared(value = np.zeros_like(self.b.eval(), dtype = theano.config.floatX), + name = "accB", + borrow = True) + self.accs = [accW, accB] # start-snippet-2 class MLP(object): @@ -194,12 +334,16 @@ def __init__(self, rng, input, n_in, n_hidden, n_out): self.params = self.hiddenLayer.params + self.logRegressionLayer.params # end-snippet-3 + #accumulators + self.accs = self.hiddenLayer.accs + self.logRegressionLayer.accs + # keep track of model input self.input = input -def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, - dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): +def test_mlp(learning_rate=0.002, L1_reg=0.00, L2_reg=0.0001, n_epochs=600, + dataset='mnist.pkl.gz', batch_size=20, n_hidden=500, + update_rule = 'standard'): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron @@ -225,7 +369,8 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, :param dataset: the path of the MNIST dataset file 
from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz - + : type update_rule: string + : param update_rule: the method of updating the weights, either RMS, momentum, or nesterov """ datasets = load_data(dataset) @@ -294,19 +439,29 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, # start-snippet-5 # compute the gradient of cost with respect to theta (sorted in params) # the resulting gradients will be stored in a list gparams - gparams = [T.grad(cost, param) for param in classifier.params] + if update_rule != 'nesterov': + gparams = [T.grad(cost, param) for param in classifier.params] # specify how to update the parameters of the model as a list of # (variable, update expression) pairs + #init epoch here so it can be used to smoothly scale up momentum + epoch = 0 + # given two lists of the same length, A = [a1, a2, a3, a4] and # B = [b1, b2, b3, b4], zip generates a list C of same size, where each # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] - updates = [ - (param, param - learning_rate * gparam) - for param, gparam in zip(classifier.params, gparams) - ] + if update_rule == 'standard': + updates = [ (param, param - learning_rate * gparam) + for param, gparam in zip(classifier.params, gparams)] + elif update_rule =='RMS': + updates = RMSprop(classifier.params, gparams, classifier.accs, lr = learning_rate) + elif update_rule == 'momentum': + updates = classical_momentum(classifier.params, gparams, classifier.accs, epoch, n_epochs, lr = learning_rate) + elif update_rule == 'nesterov': + updates = nesterov_momentum(classifier.params, classifier.accs, epoch, n_epochs, cost, lr = learning_rate) + # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules @@ -344,10 +499,9 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, test_score = 0. start_time = timeit.default_timer() - epoch = 0 done_looping = False - - while (epoch < n_epochs) and (not done_looping): + validation_errors = [] + while (epoch < n_epochs): # and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): @@ -360,6 +514,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, validation_losses = [validate_model(i) for i in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) + validation_errors.append(this_validation_loss * 100) print( 'epoch %i, minibatch %i/%i, validation error %f %%' % @@ -382,6 +537,9 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, best_validation_loss = this_validation_loss best_iter = iter + # save the best model + with open('best_model_mlp.pkl', 'wb') as f: + pickle.dump(classifier, f) # test it on the test set test_losses = [test_model(i) for i @@ -395,7 +553,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, if patience <= iter: done_looping = True - break + #break end_time = timeit.default_timer() print(('Optimization complete. 
Best validation score of %f %% '
@@ -404,7 +562,102 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
     print(('The code for file ' +
            os.path.split(__file__)[1] +
            ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
+    # log the validation curve and the run's hyper-parameters to a csv file
+    with open('optimization_%s%f.csv' % (update_rule, learning_rate), 'w') as csvfile:
+        fieldnames = ['error_validation_set',
+                      'val_freq',
+                      'minibatch/epoch',
+                      'batch_size',
+                      'learning_rate']
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerow(dict([('error_validation_set', validation_errors),
+                              ('val_freq', validation_frequency),
+                              ('minibatch/epoch', n_train_batches),
+                              ('batch_size', batch_size),
+                              ('learning_rate', learning_rate)]))
+
+
+def RMSprop(params, grads, accs, lr=0.00001, rho=0.9, epsilon=1e-6):
+    """RMSprop: scale each gradient by a running average of its magnitude."""
+    updates = []
+    for p, g, acc in zip(params, grads, accs):
+        # running average of the squared gradient
+        acc_new = rho * acc + (1 - rho) * g ** 2
+        gradient_scaling = T.sqrt(acc_new + epsilon)
+        updates.append((acc, acc_new))
+        updates.append((p, p - lr * g / gradient_scaling))
+    return updates
+
+
+def classical_momentum(params, grads, accs, epoch, total_epochs,
+                       lr=0.0001, decay=0.9):
+    """Classical momentum: accumulate a velocity and step along it."""
+    updates = []
+    # ramp the momentum coefficient up from 0.5 towards `decay` over training
+    decay = 0.5 + (decay - 0.5) * epoch / total_epochs
+    for p, g, acc in zip(params, grads, accs):
+        acc_new = decay * acc - lr * g
+        updates.append((acc, acc_new))
+        # step along the *updated* velocity
+        updates.append((p, p + acc_new))
+    return updates
+
+
+def nesterov_momentum(params, accs, epoch, total_epochs, cost,
+                      lr=0.0001, decay=0.9):
+    """Nesterov momentum: take the gradient at the look-ahead position.
+
+    Uses the standard rearrangement p <- p + decay*v_new - lr*grad so the
+    gradient can be evaluated at the current parameter values.
+    """
+    updates = []
+    # ramp the momentum coefficient up from 0.5 towards `decay` over training
+    decay = 0.5 + (decay - 0.5) * epoch / total_epochs
+    for p, acc in zip(params, accs):
+        g = T.grad(cost, p)
+        acc_new = decay * acc - lr * g
+        updates.append((acc, acc_new))
+        updates.append((p, p + decay * acc_new - lr * g))
+    return updates
+
+
+def predict(first_ten=True):
+    """
+    An example of how to load a trained model and use it
+    to predict labels.
+    """
+
+    # load the saved model
+    classifier = pickle.load(open(r'best_model_mlp.pkl', "rb"), encoding='latin1')
+
+    # compile a predictor function
+    predict_model = theano.function(
+        inputs=[classifier.input],
+        outputs=classifier.logRegressionLayer.y_pred)
+
+    # We can test it on some examples from the test set
+    A = MnistReader("test.csv")
+    print("loading test data....")
+    A.read_test_file()
+
+    test_set_x = A.inputs
+#    datasets = load_data(dataset)
+#    test_set_x, test_set_y = datasets[2]
+#    test_set_x = test_set_x.get_value()
+
+    if first_ten:
+        predicted_values = predict_model(test_set_x[:10])
+    else:
+        predicted_values = predict_model(test_set_x[:])
+
+    print("Predicted values for the first 10 examples in test set:")
+    print(predicted_values)
+
+    return predicted_values
+

 if __name__ == '__main__':
-    test_mlp()
+    for rule in ('momentum', 'standard', 'RMS'):
+        for rate in [0.01, 0.001, 0.0001]:
+            test_mlp(n_epochs=60, update_rule=rule, learning_rate=rate)
+
+    #x = predict(first_ten = False)
+
+##    with open('ans_NN.csv', 'w') as csvfile:
+##        fieldnames = ['ImageId', 'Label']
+##        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+##        writer.writeheader()
+##        for i, j in enumerate(x):
+##            writer.writerow(dict(zip(fieldnames, (i+1, j))))
+##
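
Note on the momentum ramp: in the patch, `epoch` is a plain Python int that is still 0 when the update expressions are built, so the schedule `0.5 + (decay-0.5)*epoch/total_epochs` is baked into the compiled `train_model` at its initial value and never changes during training. A minimal sketch of one way to make the ramp take effect, assuming a classical-momentum-style update; `mom`, `momentum_updates` and `ramp_momentum` are illustrative names, not part of the patch:

    # Reviewer sketch (not part of the patch): hold the momentum coefficient in a
    # theano.shared variable and reset it from the training loop once per epoch,
    # so the compiled function actually sees the 0.5 -> 0.9 ramp.
    import numpy
    import theano
    import theano.tensor as T

    def momentum_updates(params, grads, accs, mom, lr=0.0001):
        """Classical momentum whose coefficient is a shared variable."""
        updates = []
        for p, g, acc in zip(params, grads, accs):
            acc_new = mom * acc - lr * g
            updates.append((acc, acc_new))
            updates.append((p, p + acc_new))
        return updates

    mom = theano.shared(numpy.asarray(0.5, dtype=theano.config.floatX), name='mom')

    def ramp_momentum(epoch, total_epochs, final=0.9):
        """Call once per epoch, outside the compiled train function."""
        mom.set_value(numpy.asarray(0.5 + (final - 0.5) * epoch / total_epochs,
                                    dtype=theano.config.floatX))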
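
For reference, a small self-contained check of the RMSprop rule on a one-parameter quadratic; it assumes only standard Theano (`theano.shared`, `T.grad`, `theano.function`) and uses toy, illustrative names:

    # Reviewer sketch (not part of the patch): eyeball that the accumulator and
    # the parameter both move under the RMSprop updates defined above.
    import numpy
    import theano
    import theano.tensor as T

    w = theano.shared(numpy.asarray(5.0, dtype=theano.config.floatX), name='w')
    acc = theano.shared(numpy.asarray(0.0, dtype=theano.config.floatX), name='acc')

    cost = (w - 2.0) ** 2                      # minimised at w = 2
    grad = T.grad(cost, w)

    rho, lr, eps = 0.9, 0.01, 1e-6
    acc_new = rho * acc + (1 - rho) * grad ** 2
    step = theano.function([], cost,
                           updates=[(acc, acc_new),
                                    (w, w - lr * grad / T.sqrt(acc_new + eps))])

    for _ in range(100):
        step()
    print(w.get_value())                       # should approach 2.0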