diff --git a/cnn/cnnConvolve.m b/cnn/cnnConvolve.m
index 0c793b7..45bbee2 100644
--- a/cnn/cnnConvolve.m
+++ b/cnn/cnnConvolve.m
@@ -41,7 +41,9 @@
 
     % convolution of image with feature matrix
     convolvedImage = zeros(convDim, convDim);
+
     % Obtain the feature (filterDim x filterDim) needed during the convolution
+
     %%% YOUR CODE HERE %%%
 
     % Flip the feature matrix because of the definition of convolution, as explained later
@@ -52,13 +54,13 @@
 
     % Convolve "filter" with "im", adding the result to convolvedImage
     % be sure to do a 'valid' convolution
-    %%% YOUR CODE HERE %%%
 
+    %%% YOUR CODE HERE %%%
     
     % Add the bias unit
     % Then, apply the sigmoid function to get the hidden activation
-    %%% YOUR CODE HERE %%%
 
+    %%% YOUR CODE HERE %%%
 
     
     convolvedFeatures(:, :, filterNum, imageNum) = convolvedImage;
diff --git a/cnn/cnnCost.m b/cnn/cnnCost.m
index 1feebf8..d56af2f 100644
--- a/cnn/cnnCost.m
+++ b/cnn/cnnCost.m
@@ -1,16 +1,19 @@
 function [cost, grad, preds] = cnnCost(theta,images,labels,numClasses,...
                                 filterDim,numFilters,poolDim,pred)
-% Calcualte cost and gradient for a single layer convolutional neural
-% network followed by a softmax layer with cross entropy objective.
+% Calculate cost and gradient for a single layer convolutional
+% neural network followed by a softmax layer with cross entropy
+% objective.
 %                            
 % Parameters:
 %  theta      -  unrolled parameter vector
-%  images     -  stores images in imageDim x imageDim x numImges array
+%  images     -  stores images in imageDim x imageDim x numImages
+%                array
 %  numClasses -  number of classes to predict
 %  filterDim  -  dimension of convolutional filter                            
 %  numFilters -  number of convolutional filters
 %  poolDim    -  dimension of pooling area
-%  pred       -  boolean only forward propagate and return predictions
+%  pred       -  boolean only forward propagate and return
+%                predictions
 %
 %
 % Returns:
@@ -32,19 +35,18 @@
 % Wc is filterDim x filterDim x numFilters parameter matrix
 % bc is the corresponding bias
 
-% Wd is numClasses x hiddenSize parameter matrix where hiddenSize is the
-% number of output units from the convolutional layer
+% Wd is numClasses x hiddenSize parameter matrix where hiddenSize
+% is the number of output units from the convolutional layer
 % bd is corresponding bias
 [Wc, Wd, bc, bd] = cnnParamsToStack(theta,imageDim,filterDim,numFilters,...
                         poolDim,numClasses);
 
-% Same sizes as Wc,Wd,bc,bd.  Used to hold gradient w.r.t above params.
+% Same sizes as Wc,Wd,bc,bd. Used to hold gradient w.r.t above params.
 Wc_grad = zeros(size(Wc));
 Wd_grad = zeros(size(Wd));
 bc_grad = zeros(size(bc));
 bd_grad = zeros(size(bd));
 
-
 %%======================================================================
 %% STEP 1a: Forward Propagation
 %  In this step you will forward propagate the input through the
@@ -71,7 +73,6 @@
 
 %%% YOUR CODE HERE %%%
 
-
 % Reshape activations into 2-d matrix, hiddenSize x numImages,
 % for Softmax layer
 activationsPooled = reshape(activationsPooled,[],numImages);
@@ -88,7 +89,6 @@
 
 %%% YOUR CODE HERE %%%
 
-
 %%======================================================================
 %% STEP 1b: Calculate Cost
 %  In this step you will use the labels given as input and the probs
@@ -99,7 +99,6 @@
 
 %%% YOUR CODE HERE %%%
 
-
 % Makes predictions given probs and returns without backproagating errors.
 if pred
     [~,preds] = max(probs,[],1);
@@ -121,17 +120,16 @@
 %%% YOUR CODE HERE %%%
 
 %%======================================================================
-%% STEP 1c: Gradient Calculation
+%% STEP 1d: Gradient Calculation
 %  After backpropagating the errors above, we can use them to calculate the
 %  gradient with respect to all the parameters.  The gradient w.r.t the
 %  softmax layer is calculated as usual.  To calculate the gradient w.r.t.
 %  a filter in the convolutional layer, convolve the backpropagated error
-%  for that fileter with each image and aggregate over images.
+%  for that filter with each image and aggregate over images.
 
 %%% YOUR CODE HERE %%%
 
-
 %% Unroll gradient into grad vector for minFunc
 grad = [Wc_grad(:) ; Wd_grad(:) ; bc_grad(:) ; bd_grad(:)];
 
-end
\ No newline at end of file
+end
diff --git a/cnn/cnnInitParams.m b/cnn/cnnInitParams.m
index 1ae7045..e38fd85 100644
--- a/cnn/cnnInitParams.m
+++ b/cnn/cnnInitParams.m
@@ -28,7 +28,8 @@
 outDim = outDim/poolDim;
 hiddenSize = outDim^2*numFilters;
 
-r  = sqrt(6) / sqrt(numClasses+hiddenSize+1);   % we'll choose weights uniformly from the interval [-r, r]
+% we'll choose weights uniformly from the interval [-r, r]
+r  = sqrt(6) / sqrt(numClasses+hiddenSize+1);
 Wd = rand(numClasses, hiddenSize) * 2 * r - r;
 
 bc = zeros(numFilters, 1);
diff --git a/cnn/cnnPool.m b/cnn/cnnPool.m
index 9e039f7..c740915 100644
--- a/cnn/cnnPool.m
+++ b/cnn/cnnPool.m
@@ -31,6 +31,5 @@
 
 %%% YOUR CODE HERE %%%
 
-
 end
 
diff --git a/cnn/cnnTrain.m b/cnn/cnnTrain.m
index 5ac045a..fe034f3 100644
--- a/cnn/cnnTrain.m
+++ b/cnn/cnnTrain.m
@@ -15,8 +15,8 @@
 % Configuration
 imageDim = 28;
 numClasses = 10;  % Number of classes (MNIST images fall into 10 classes)
-filterDim = 9;    % Filter size for conv layer (should divide imageDim)
-numFilters = 10;   % Number of filters for conv layer
+filterDim = 9;    % Filter size for conv layer
+numFilters = 20;   % Number of filters for conv layer
 poolDim = 2;      % Pooling dimension, (should divide imageDim-filterDim+1)
 
 % Load MNIST Train
@@ -39,7 +39,7 @@
 %  calculation for your cnnCost.m function.  You may need to add the
 %  appropriate path or copy the file to this directory.
 
-DEBUG=true;  % set this to true to check gradient
+DEBUG=false;  % set this to true to check gradient
 if DEBUG
     % To speed up gradient checking, we will use a reduced network and
     % a debugging data set
diff --git a/cnn/minFuncSGD.m b/cnn/minFuncSGD.m
index b62d518..3571064 100644
--- a/cnn/minFuncSGD.m
+++ b/cnn/minFuncSGD.m
@@ -1,11 +1,12 @@
 function [opttheta] = minFuncSGD(funObj,theta,data,labels,...
                         options)
-% Runs stochastic gradient descent with momentum to optimize the parameters
-% for the given objective.
+% Runs stochastic gradient descent with momentum to optimize the
+% parameters for the given objective.
 %
 % Parameters:
-%  funObj     -  function handle which accepts as input theta, data, labels
-%                and returns cost and gradient w.r.t to theta.
+%  funObj     -  function handle which accepts as input theta,
+%                data, labels and returns cost and gradient w.r.t
+%                to theta.
 %  theta      -  unrolled parameter vector
 %  data       -  stores data in m x n x numExamples tensor
 %  labels     -  corresponding labels in numExamples x 1 vector
@@ -34,7 +35,7 @@
 m = length(labels); % training set size
 % Setup for momentum
 mom = 0.5;
-momIncrease = 10;
+momIncrease = 20;
 velocity = zeros(size(theta));
 
 %%======================================================================
@@ -47,7 +48,7 @@
     
     for s=1:minibatch:(m-minibatch+1)
         it = it + 1;
-    
+
         % increase momentum after momIncrease iterations
         if it == momIncrease
             mom = options.momentum;
@@ -60,10 +61,10 @@
         % evaluate the objective function on the next minibatch
         [cost grad] = funObj(theta,mb_data,mb_labels);
         
-        % Instructions: Add in the weighted velocity vector to the gradient
-        % evaluated above.  Then update the current weights theta according
-        % to the sgd update rule with alpha as the learning rate.  Finally
-        % update the velocity vector.
+        % Instructions: Add the weighted velocity vector to the
+        % gradient (evaluated above) scaled by the learning rate.
+        % Then update the current weights theta according to the
+        % sgd update rule.
         
         %%% YOUR CODE HERE %%%
         
@@ -77,4 +78,4 @@
 
 opttheta = theta;
 
-end
\ No newline at end of file
+end
diff --git a/rica/runSoftICA.m b/rica/runSoftICA.m
index f369ea6..c99ffbe 100644
--- a/rica/runSoftICA.m
+++ b/rica/runSoftICA.m
@@ -24,10 +24,11 @@
 % Step 1) Sample patches
 patches = samplePatches(data,params.patchWidth,params.m);
 % Step 2) Apply ZCA
-%%% YOUR CODE HERE %%%
+patches = zca2(patches);
 % Step 3) Normalize each patch. Each patch should be normalized as
 % x / ||x||_2 where x is the vector representation of the patch
-%%% YOUR CODE HERE %%%
+m = sqrt(sum(patches.^2) + (1e-8));
+x = bsxfunwrap(@rdivide,patches,m);
 
 %% Run the optimization
 options.Method = 'lbfgs';
diff --git a/stl/feedfowardRICA.m b/stl/feedfowardRICA.m
index 0f56d70..4c073b9 100644
--- a/stl/feedfowardRICA.m
+++ b/stl/feedfowardRICA.m
@@ -58,7 +58,7 @@
     % be sure to do a 'valid' convolution
     % ---- YOUR CODE HERE ----
     resp = conv2(im,filter,'valid');
-      
+    % ------------------------      
     % Then, apply square-square-root pooling on "resp" to get the hidden
     % activation "act"
     act = zeros(convDim / poolDim, convDim / poolDim); % You should replace this
diff --git a/stl/stlExercise.m b/stl/stlExercise.m
index 1aefdd6..fb54c3d 100644
--- a/stl/stlExercise.m
+++ b/stl/stlExercise.m
@@ -131,11 +131,12 @@
 
 %  Use minFunc and softmax_regression_vec from the previous exercise to 
 %  train a multi-class classifier. 
-%% ----------------- YOUR CODE HERE ----------------------
 options.Method = 'lbfgs';
 options.MaxFunEvals = Inf;
 options.MaxIter = 300;
+
 % optimize
+%% ----------------- YOUR CODE HERE ----------------------
 [opttheta_softmax, cost, exitflag] = minFunc( @(theta) softmax_regression_vec(theta, trainFeatures, trainLabels), randTheta2, options);
 
 %% -----------------------------------------------------