testing different update rules
added momentum, nesterov momentum and RMSprop
esmason authored Jul 1, 2016
1 parent de99c6e commit 6e19044
Showing 1 changed file with 272 additions and 19 deletions.
291 changes: 272 additions & 19 deletions code/mlp.py
@@ -23,18 +23,153 @@

__docformat__ = 'restructedtext en'


import numpy as np
import os
import sys
import timeit

import csv
import numpy

import theano
import theano.tensor as T
import pickle
from MnistReader import MnistReader
from theano.printing import pydotprint

from data import load_data

class LogisticRegression(object):
"""Multi-class Logistic Regression Class
The logistic regression is fully described by a weight matrix :math:`W`
and bias vector :math:`b`. Classification is done by projecting data
points onto a set of hyperplanes, the distance to which is used to
determine a class membership probability.
"""

def __init__(self, input, n_in, n_out):
""" Initialize the parameters of the logistic regression
from logistic_sgd import LogisticRegression, load_data
:type input: theano.tensor.TensorType
:param input: symbolic variable that describes the input of the
architecture (one minibatch)
:type n_in: int
:param n_in: number of input units, the dimension of the space in
which the datapoints lie
:type n_out: int
:param n_out: number of output units, the dimension of the space in
which the labels lie
"""
# start-snippet-1
# initialize with 0 the weights W as a matrix of shape (n_in, n_out)
self.W = theano.shared(
value=numpy.zeros(
(n_in, n_out),
dtype=theano.config.floatX
),
name='W',
borrow=True
)
# initialize the biases b as a vector of n_out 0s
self.b = theano.shared(
value=numpy.zeros(
(n_out,),
dtype=theano.config.floatX
),
name='b',
borrow=True
)
        # initialize per-parameter accumulators (shared variables) used by the
        # RMSprop and momentum update rules; they start at zero and have the
        # same shapes as W and b
        accW = theano.shared(
            value=np.zeros_like(self.W.eval(), dtype=theano.config.floatX),
            name="accW",
            borrow=True,
        )
        accB = theano.shared(
            value=np.zeros_like(self.b.eval(), dtype=theano.config.floatX),
            name="accB",
            borrow=True,
        )

self.accs = [accW, accB]


        # symbolic expression for computing the matrix of class-membership
        # probabilities, where:
        # W is a matrix whose column k represents the separating hyperplane
        #   for class k
        # x is a matrix whose row j represents input training sample j
        # b is a vector whose element k is the free parameter of hyperplane k
self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
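        # Illustrative example (not part of the model), assuming a single
        # 2-feature input and 3 classes with zero weights and biases:
        #   x = [[0.5, -1.0]],  W = zeros((2, 3)),  b = zeros(3)
        #   scores = x.dot(W) + b   -> [[0., 0., 0.]]
        #   softmax(scores)         -> [[1/3, 1/3, 1/3]]
        # i.e. each row of p_y_given_x is non-negative and its entries sum to 1.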

# symbolic description of how to compute prediction as class whose
# probability is maximal
self.y_pred = T.argmax(self.p_y_given_x, axis=1)
# end-snippet-1

# parameters of the model
self.params = [self.W, self.b]

# keep track of model input
self.input = input

def negative_log_likelihood(self, y):
"""Return the mean of the negative log-likelihood of the prediction
of this model under a given target distribution.
        .. math::

            \mathrm{NLL}(\theta=\{W,b\}, \mathcal{D}) =
            -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|-1}
                \log P(Y=y^{(i)} | x^{(i)}, W, b)
:type y: theano.tensor.TensorType
:param y: corresponds to a vector that gives for each example the
correct label
Note: we use the mean instead of the sum so that
the learning rate is less dependent on the batch size
"""
# start-snippet-2
        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
        # the number of examples (call it n) in the minibatch.
        # T.arange(y.shape[0]) is a symbolic vector which will contain
        # [0, 1, 2, ..., n-1].
        # T.log(self.p_y_given_x) is a matrix of log-probabilities (call it LP)
        # with one row per example and one column per class.
        # LP[T.arange(y.shape[0]), y] is a vector v containing
        # [LP[0, y[0]], LP[1, y[1]], ..., LP[n-1, y[n-1]]], and
        # T.mean(LP[T.arange(y.shape[0]), y]) is the mean (across minibatch
        # examples) of the elements in v, i.e., the mean log-likelihood across
        # the minibatch.
return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
# end-snippet-2
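    # A minimal numpy sketch (illustration only) of the indexing trick used in
    # negative_log_likelihood, assuming a minibatch of 3 examples and 4 classes:
    #
    #   LP = numpy.log(numpy.array([[0.7, 0.1, 0.1, 0.1],
    #                               [0.2, 0.5, 0.2, 0.1],
    #                               [0.25, 0.25, 0.25, 0.25]]))
    #   y = numpy.array([0, 1, 3])
    #   LP[numpy.arange(3), y]          # -> [log 0.7, log 0.5, log 0.25]
    #   -LP[numpy.arange(3), y].mean()  # the minibatch negative log-likelihood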

def errors(self, y):
"""Return a float representing the number of errors in the minibatch
over the total number of examples of the minibatch ; zero one
loss over the size of the minibatch
:type y: theano.tensor.TensorType
:param y: corresponds to a vector that gives for each example the
correct label
"""

# check if y has same dimension of y_pred
if y.ndim != self.y_pred.ndim:
raise TypeError(
'y should have the same shape as self.y_pred',
('y', y.type, 'y_pred', self.y_pred.type)
)
# check if y is of the correct datatype
if y.dtype.startswith('int'):
# the T.neq operator returns a vector of 0s and 1s, where 1
# represents a mistake in prediction
return T.mean(T.neq(self.y_pred, y))
else:
raise NotImplementedError()


# start-snippet-1
@@ -98,18 +233,23 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None,
if b is None:
b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
b = theano.shared(value=b_values, name='b', borrow=True)

self.W = W
self.b = b

lin_output = T.dot(input, self.W) + self.b
self.output = (
lin_output if activation is None
else activation(lin_output)
)
# parameters of the model
self.params = [self.W, self.b]
        # per-parameter accumulators (shared variables) for the RMSprop and
        # momentum update rules, initialized to zero
        accW = theano.shared(
            value=np.zeros_like(self.W.eval(), dtype=theano.config.floatX),
            name="accW",
            borrow=True)
        accB = theano.shared(
            value=np.zeros_like(self.b.eval(), dtype=theano.config.floatX),
            name="accB",
            borrow=True)

self.accs = [accW, accB]

# start-snippet-2
class MLP(object):
@@ -194,12 +334,16 @@ def __init__(self, rng, input, n_in, n_hidden, n_out):
self.params = self.hiddenLayer.params + self.logRegressionLayer.params
# end-snippet-3

#accumulators
self.accs = self.hiddenLayer.accs + self.logRegressionLayer.accs

# keep track of model input
self.input = input


def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
def test_mlp(learning_rate=0.002, L1_reg=0.00, L2_reg=0.0001, n_epochs=600,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500,
             update_rule='standard'):
"""
Demonstrate stochastic gradient descent optimization for a multilayer
perceptron
@@ -225,7 +369,8 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
:param dataset: the path of the MNIST dataset file from
http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    :type update_rule: string
    :param update_rule: the weight update rule to use; one of 'standard',
        'RMS', 'momentum', or 'nesterov'
"""
datasets = load_data(dataset)

@@ -294,19 +439,29 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
# start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
# the resulting gradients will be stored in a list gparams
gparams = [T.grad(cost, param) for param in classifier.params]
    # for 'nesterov' the gradients are computed inside nesterov_momentum itself
    if update_rule != 'nesterov':
        gparams = [T.grad(cost, param) for param in classifier.params]

# specify how to update the parameters of the model as a list of
# (variable, update expression) pairs

    # init epoch here so it can be used to smoothly scale up momentum; note
    # that epoch is a plain Python int, so the schedule is fixed at the value
    # it has when the update expressions are built
    epoch = 0

# given two lists of the same length, A = [a1, a2, a3, a4] and
# B = [b1, b2, b3, b4], zip generates a list C of same size, where each
# element is a pair formed from the two lists :
# C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
updates = [
(param, param - learning_rate * gparam)
for param, gparam in zip(classifier.params, gparams)
]
    if update_rule == 'standard':
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(classifier.params, gparams)]
    elif update_rule == 'RMS':
        updates = RMSprop(classifier.params, gparams, classifier.accs,
                          lr=learning_rate)
    elif update_rule == 'momentum':
        updates = classical_momentum(classifier.params, gparams, classifier.accs,
                                     epoch, n_epochs, lr=learning_rate)
    elif update_rule == 'nesterov':
        updates = nesterov_momentum(classifier.params, classifier.accs,
                                    epoch, n_epochs, cost, lr=learning_rate)


    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
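    # A minimal sketch (illustration only; variable names such as `index`, `x`,
    # `y`, `train_set_x` and `train_set_y` are the usual tutorial names and are
    # assumed here): theano.function applies every (shared_variable,
    # new_expression) pair in `updates` after each call, e.g.
    #
    #   train_model = theano.function(
    #       inputs=[index],
    #       outputs=cost,
    #       updates=updates,
    #       givens={
    #           x: train_set_x[index * batch_size: (index + 1) * batch_size],
    #           y: train_set_y[index * batch_size: (index + 1) * batch_size]
    #       }
    #   )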
@@ -344,10 +499,9 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
test_score = 0.
start_time = timeit.default_timer()

epoch = 0
done_looping = False

while (epoch < n_epochs) and (not done_looping):
validation_errors = []
while (epoch < n_epochs): # and (not done_looping):
epoch = epoch + 1
for minibatch_index in range(n_train_batches):

@@ -360,6 +514,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
validation_losses = [validate_model(i) for i
in range(n_valid_batches)]
this_validation_loss = numpy.mean(validation_losses)
validation_errors.append(this_validation_loss * 100)

print(
'epoch %i, minibatch %i/%i, validation error %f %%' %
@@ -382,6 +537,9 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,

best_validation_loss = this_validation_loss
best_iter = iter
# save the best model
with open('best_model_mlp.pkl', 'wb') as f:
pickle.dump(classifier, f)

# test it on the test set
test_losses = [test_model(i) for i
@@ -395,7 +553,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,

if patience <= iter:
done_looping = True
break
#break

end_time = timeit.default_timer()
print(('Optimization complete. Best validation score of %f %% '
@@ -404,7 +562,102 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
print(('The code for file ' +
os.path.split(__file__)[1] +
' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
    with open('optimization_%s%f.csv' % (update_rule, learning_rate), 'w') as csvfile:
        fieldnames = ['error_validation_set',
                      'val_freq',
                      'minibatch/epoch',
                      'batch_size',
                      'learning_rate']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerow({'error_validation_set': validation_errors,
                         'val_freq': validation_frequency,
                         'minibatch/epoch': n_train_batches,
                         'batch_size': batch_size,
                         'learning_rate': learning_rate})

def RMSprop(params, grads, accs, lr=0.00001, rho=0.9, epsilon=1e-6):
    """RMSprop: scale each gradient by a running average of its magnitude."""
    updates = []
    for p, g, acc in zip(params, grads, accs):
        # running average of the squared gradient
        acc_new = rho * acc + (1 - rho) * g ** 2
        # divide the gradient by the root of the running average
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
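# Worked example (illustration only): with rho = 0.9 and a fresh accumulator
# acc = 0, a first gradient of g = 1.0 gives
#   acc_new = 0.9 * 0 + 0.1 * 1.0 ** 2 = 0.1
#   g / sqrt(acc_new + epsilon)  ~=  1.0 / sqrt(0.1)  ~=  3.16
# so the effective step grows for parameters whose recent gradients were small
# and shrinks as the running average of the squared gradient grows.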

def classical_momentum(params, grads, accs, epoch, total_epochs,
                       lr=0.0001, decay=0.9):
    """Classical momentum: accumulate a velocity and step along it."""
    updates = []
    # ramp the momentum coefficient linearly from 0.5 towards `decay`
    decay = 0.5 + (decay - 0.5) * epoch / total_epochs

    for p, g, acc in zip(params, grads, accs):
        # new velocity: decayed old velocity minus the scaled gradient
        acc_new = decay * acc - lr * g

        updates.append((acc, acc_new))
        updates.append((p, p + acc_new))
    return updates
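# Worked example (illustration only) of the momentum schedule above, with the
# default decay = 0.9 and total_epochs = 600:
#   epoch = 0   -> decay = 0.5
#   epoch = 300 -> decay = 0.5 + (0.9 - 0.5) * 300 / 600 = 0.7
#   epoch = 600 -> decay = 0.9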

def nesterov_momentum(params, accs, epoch, total_epochs, cost,
                      lr=0.0001, decay=0.9):
    """Nesterov momentum in its reformulated ("look-ahead") form:
        v_new = mu * v - lr * grad(cost, p)
        p_new = p + mu * v_new - lr * grad(cost, p)
    """
    updates = []
    # ramp the momentum coefficient linearly from 0.5 towards `decay`
    decay = 0.5 + (decay - 0.5) * epoch / total_epochs
    for p, acc in zip(params, accs):
        g = T.grad(cost, p)
        acc_new = decay * acc - lr * g
        updates.append((acc, acc_new))
        updates.append((p, p + decay * acc_new - lr * g))
    return updates
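# Minimal usage sketch (illustration only; `w`, `vel`, `x_in` and `toy_cost`
# are made-up names): each rule above returns a list of (shared_variable,
# new_expression) pairs that can be passed directly to theano.function.
#
#   w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
#   vel = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='vel')
#   x_in = T.vector('x_in')
#   toy_cost = T.sum((T.dot(x_in, w) - 1.) ** 2)
#   toy_updates = nesterov_momentum([w], [vel], 0, 1, toy_cost, lr=0.01)
#   toy_step = theano.function([x_in], toy_cost, updates=toy_updates)
#   toy_step(np.ones(3, dtype=theano.config.floatX))  # one parameter update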

def predict(first_ten = True):
"""
An example of how to load a trained model and use it
to predict labels.
"""

# load the saved model
classifier = pickle.load(open(r'best_model_mlp.pkl', "rb"), encoding = 'latin1')

# compile a predictor function
predict_model = theano.function(
inputs=[classifier.input],
outputs=classifier.logRegressionLayer.y_pred)

    # We can test it on some examples from the test set
A = MnistReader("test.csv")
print("loading test data....")
A.read_test_file()

test_set_x = A.inputs
# datasets = load_data(dataset)
# test_set_x, test_set_y = datasets[2]
# test_set_x = test_set_x.get_value()

    if first_ten:
        predicted_values = predict_model(test_set_x[:10])
        print("Predicted values for the first 10 examples in the test set:")
    else:
        predicted_values = predict_model(test_set_x[:])
        print("Predicted values for the full test set:")
    print(predicted_values)

return predicted_values


if __name__ == '__main__':
test_mlp()
    for rule in ('momentum', 'standard', 'RMS'):
        for rate in [0.01, 0.001, 0.0001]:
            test_mlp(n_epochs=60, update_rule=rule, learning_rate=rate)

#x = predict(first_ten = False)

## with open('ans_NN.csv', 'w') as csvfile:
## fieldnames = ['ImageId', 'Label']
## writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
## writer.writeheader()
## for i,j in enumerate(x):
## writer.writerow(dict(zip(fieldnames, (i+1, j) )))
##

