From 6e19044d440513b40bdde40616b9acd8d1d9f4dd Mon Sep 17 00:00:00 2001 From: esmason Date: Fri, 1 Jul 2016 14:11:31 -0700 Subject: [PATCH] testing different update rules added momentum, nesterov momentum and RMSprop --- code/mlp.py | 291 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 272 insertions(+), 19 deletions(-) diff --git a/code/mlp.py b/code/mlp.py index e865bc8f..08f5a50a 100644 --- a/code/mlp.py +++ b/code/mlp.py @@ -23,18 +23,153 @@ __docformat__ = 'restructedtext en' - +import numpy as np import os import sys import timeit - +import csv import numpy - import theano import theano.tensor as T +import pickle +from MnistReader import MnistReader +from theano.printing import pydotprint + +from data import load_data + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression -from logistic_sgd import LogisticRegression, load_data + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + # start-snippet-1 + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( + value=numpy.zeros( + (n_in, n_out), + dtype=theano.config.floatX + ), + name='W', + borrow=True + ) + # initialize the biases b as a vector of n_out 0s + self.b = theano.shared( + value=numpy.zeros( + (n_out,), + dtype=theano.config.floatX + ), + name='b', + borrow=True + ) + #initialize accumulators for RMSprop + accW = theano.shared(value = np.zeros_like(self.W.eval(), dtype = theano.config.floatX), + name = "accW", + borrow = True, + ) + accB = theano.shared(value = np.zeros_like(self.b.eval(), dtype = theano.config.floatX), + name = "accB", + borrow = True, + ) + + self.accs = [accW, accB] + + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyperplane for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of + # hyperplane-k + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) + + # symbolic description of how to compute prediction as class whose + # probability is maximal + self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 + + # parameters of the model + self.params = [self.W, self.b] + + # keep track of model input + self.input = input + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. 
math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # start-snippet-2 + # y.shape[0] is (symbolically) the number of rows in y, i.e., + # number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain + # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of + # Log-Probabilities (call it LP) with one row per example and + # one column per class LP[T.arange(y.shape[0]),y] is a vector + # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., + # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is + # the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) + # end-snippet-2 + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError( + 'y should have the same shape as self.y_pred', + ('y', y.type, 'y_pred', self.y_pred.type) + ) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() # start-snippet-1 @@ -98,10 +233,8 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, if b is None: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) - self.W = W self.b = b - lin_output = T.dot(input, self.W) + self.b self.output = ( lin_output if activation is None @@ -109,7 +242,14 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, ) # parameters of the model self.params = [self.W, self.b] + accW = theano.shared(value = np.zeros_like(self.W.eval(), dtype = theano.config.floatX), + name = "accW", + borrow = True) + accB = theano.shared(value = np.zeros_like(self.b.eval(), dtype = theano.config.floatX), + name = "accB", + borrow = True) + self.accs = [accW, accB] # start-snippet-2 class MLP(object): @@ -194,12 +334,16 @@ def __init__(self, rng, input, n_in, n_hidden, n_out): self.params = self.hiddenLayer.params + self.logRegressionLayer.params # end-snippet-3 + #accumulators + self.accs = self.hiddenLayer.accs + self.logRegressionLayer.accs + # keep track of model input self.input = input -def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, - dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): +def test_mlp(learning_rate=0.002, L1_reg=0.00, L2_reg=0.0001, n_epochs=600, + dataset='mnist.pkl.gz', batch_size=20, n_hidden=500, + update_rule = 'standard'): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron @@ -225,7 +369,8 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, :param dataset: the path of the MNIST dataset file 
from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz - + : type update_rule: string + : param update_rule: the method of updating the weights, either RMS, momentum, or nesterov """ datasets = load_data(dataset) @@ -294,19 +439,29 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, # start-snippet-5 # compute the gradient of cost with respect to theta (sorted in params) # the resulting gradients will be stored in a list gparams - gparams = [T.grad(cost, param) for param in classifier.params] + if update_rule != 'nesterov': + gparams = [T.grad(cost, param) for param in classifier.params] # specify how to update the parameters of the model as a list of # (variable, update expression) pairs + #init epoch here so it can be used to smoothly scale up momentum + epoch = 0 + # given two lists of the same length, A = [a1, a2, a3, a4] and # B = [b1, b2, b3, b4], zip generates a list C of same size, where each # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] - updates = [ - (param, param - learning_rate * gparam) - for param, gparam in zip(classifier.params, gparams) - ] + if update_rule == 'standard': + updates = [ (param, param - learning_rate * gparam) + for param, gparam in zip(classifier.params, gparams)] + elif update_rule =='RMS': + updates = RMSprop(classifier.params, gparams, classifier.accs, lr = learning_rate) + elif update_rule == 'momentum': + updates = classical_momentum(classifier.params, gparams, classifier.accs, epoch, n_epochs, lr = learning_rate) + elif update_rule == 'nesterov': + updates = nesterov_momentum(classifier.params, classifier.accs, epoch, n_epochs, cost, lr = learning_rate) + # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules @@ -344,10 +499,9 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, test_score = 0. start_time = timeit.default_timer() - epoch = 0 done_looping = False - - while (epoch < n_epochs) and (not done_looping): + validation_errors = [] + while (epoch < n_epochs): # and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): @@ -360,6 +514,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, validation_losses = [validate_model(i) for i in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) + validation_errors.append(this_validation_loss * 100) print( 'epoch %i, minibatch %i/%i, validation error %f %%' % @@ -382,6 +537,9 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, best_validation_loss = this_validation_loss best_iter = iter + # save the best model + with open('best_model_mlp.pkl', 'wb') as f: + pickle.dump(classifier, f) # test it on the test set test_losses = [test_model(i) for i @@ -395,7 +553,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, if patience <= iter: done_looping = True - break + #break end_time = timeit.default_timer() print(('Optimization complete. 
Best validation score of %f %% '
@@ -404,7 +562,102 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
     print(('The code for file ' +
            os.path.split(__file__)[1] +
            ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
+    # log the validation curve and the run's hyper-parameters to a csv file
+    with open('optimization_%s%f.csv' % (update_rule, learning_rate), 'w') as csvfile:
+        fieldnames = ['error_validation_set',
+                      'val_freq',
+                      'minibatch/epoch',
+                      'batch_size',
+                      'learning_rate']
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerow(dict([('error_validation_set', validation_errors),
+                              ('val_freq', validation_frequency),
+                              ('minibatch/epoch', n_train_batches),
+                              ('batch_size', batch_size),
+                              ('learning_rate', learning_rate)]))
+
+
+def RMSprop(params, grads, accs, lr=0.00001, rho=0.9, epsilon=1e-6):
+    """RMSprop: scale each gradient by a running average of its magnitude."""
+    updates = []
+    for p, g, acc in zip(params, grads, accs):
+        # running average of the squared gradient
+        acc_new = rho * acc + (1 - rho) * g ** 2
+        gradient_scaling = T.sqrt(acc_new + epsilon)
+        updates.append((acc, acc_new))
+        updates.append((p, p - lr * g / gradient_scaling))
+    return updates
+
+
+def classical_momentum(params, grads, accs, epoch, total_epochs,
+                       lr=0.0001, decay=0.9):
+    """Classical momentum: accumulate a velocity and step along it."""
+    updates = []
+    # ramp the momentum coefficient up from 0.5 towards `decay` over training
+    decay = 0.5 + (decay - 0.5) * epoch / total_epochs
+    for p, g, acc in zip(params, grads, accs):
+        acc_new = decay * acc - lr * g
+        updates.append((acc, acc_new))
+        # step along the *updated* velocity
+        updates.append((p, p + acc_new))
+    return updates
+
+
+def nesterov_momentum(params, accs, epoch, total_epochs, cost,
+                      lr=0.0001, decay=0.9):
+    """Nesterov momentum: take the gradient at the look-ahead position.
+
+    Uses the standard rearrangement p <- p + decay*v_new - lr*grad so the
+    gradient can be evaluated at the current parameter values.
+    """
+    updates = []
+    # ramp the momentum coefficient up from 0.5 towards `decay` over training
+    decay = 0.5 + (decay - 0.5) * epoch / total_epochs
+    for p, acc in zip(params, accs):
+        g = T.grad(cost, p)
+        acc_new = decay * acc - lr * g
+        updates.append((acc, acc_new))
+        updates.append((p, p + decay * acc_new - lr * g))
+    return updates
+
+
+def predict(first_ten=True):
+    """
+    An example of how to load a trained model and use it
+    to predict labels.
+    """
+
+    # load the saved model
+    classifier = pickle.load(open(r'best_model_mlp.pkl', "rb"), encoding='latin1')
+
+    # compile a predictor function
+    predict_model = theano.function(
+        inputs=[classifier.input],
+        outputs=classifier.logRegressionLayer.y_pred)
+
+    # We can test it on some examples from the test set
+    A = MnistReader("test.csv")
+    print("loading test data....")
+    A.read_test_file()
+
+    test_set_x = A.inputs
+#    datasets = load_data(dataset)
+#    test_set_x, test_set_y = datasets[2]
+#    test_set_x = test_set_x.get_value()
+
+    if first_ten:
+        predicted_values = predict_model(test_set_x[:10])
+    else:
+        predicted_values = predict_model(test_set_x[:])
+
+    print("Predicted values for the first 10 examples in test set:")
+    print(predicted_values)
+
+    return predicted_values
+

 if __name__ == '__main__':
-    test_mlp()
+    for rule in ('momentum', 'standard', 'RMS'):
+        for rate in [0.01, 0.001, 0.0001]:
+            test_mlp(n_epochs=60, update_rule=rule, learning_rate=rate)
+
+    #x = predict(first_ten = False)
+
+##    with open('ans_NN.csv', 'w') as csvfile:
+##        fieldnames = ['ImageId', 'Label']
+##        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+##        writer.writeheader()
+##        for i, j in enumerate(x):
+##            writer.writerow(dict(zip(fieldnames, (i+1, j))))
+##
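
Note on the momentum ramp: in the patch, `epoch` is a plain Python int that is still 0 when the update expressions are built, so the schedule `0.5 + (decay-0.5)*epoch/total_epochs` is baked into the compiled `train_model` at its initial value and never changes during training. A minimal sketch of one way to make the ramp take effect, assuming a classical-momentum-style update; `mom`, `momentum_updates` and `ramp_momentum` are illustrative names, not part of the patch:

    # Reviewer sketch (not part of the patch): hold the momentum coefficient in a
    # theano.shared variable and reset it from the training loop once per epoch,
    # so the compiled function actually sees the 0.5 -> 0.9 ramp.
    import numpy
    import theano
    import theano.tensor as T

    def momentum_updates(params, grads, accs, mom, lr=0.0001):
        """Classical momentum whose coefficient is a shared variable."""
        updates = []
        for p, g, acc in zip(params, grads, accs):
            acc_new = mom * acc - lr * g
            updates.append((acc, acc_new))
            updates.append((p, p + acc_new))
        return updates

    mom = theano.shared(numpy.asarray(0.5, dtype=theano.config.floatX), name='mom')

    def ramp_momentum(epoch, total_epochs, final=0.9):
        """Call once per epoch, outside the compiled train function."""
        mom.set_value(numpy.asarray(0.5 + (final - 0.5) * epoch / total_epochs,
                                    dtype=theano.config.floatX))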
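
For reference, a small self-contained check of the RMSprop rule on a one-parameter quadratic; it assumes only standard Theano (`theano.shared`, `T.grad`, `theano.function`) and uses toy, illustrative names:

    # Reviewer sketch (not part of the patch): eyeball that the accumulator and
    # the parameter both move under the RMSprop updates defined above.
    import numpy
    import theano
    import theano.tensor as T

    w = theano.shared(numpy.asarray(5.0, dtype=theano.config.floatX), name='w')
    acc = theano.shared(numpy.asarray(0.0, dtype=theano.config.floatX), name='acc')

    cost = (w - 2.0) ** 2                      # minimised at w = 2
    grad = T.grad(cost, w)

    rho, lr, eps = 0.9, 0.01, 1e-6
    acc_new = rho * acc + (1 - rho) * grad ** 2
    step = theano.function([], cost,
                           updates=[(acc, acc_new),
                                    (w, w - lr * grad / T.sqrt(acc_new + eps))])

    for _ in range(100):
        step()
    print(w.get_value())                       # should approach 2.0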