diff --git a/src/operator/correlation1D-inl.h b/src/operator/correlation1D-inl.h
index 3705441e1492..01ab828556a1 100644
--- a/src/operator/correlation1D-inl.h
+++ b/src/operator/correlation1D-inl.h
@@ -2,7 +2,7 @@
  * Copyright (c) 2015 by Contributors
  * \file correlation1D-inl.h
  * \brief correlation1D operator and symbol
- * \author Xu Dong 
+ * \author Xu Dong
 */
 #ifndef MXNET_OPERATOR_CORRELATION1D_INL_H_
 #define MXNET_OPERATOR_CORRELATION1D_INL_H_
@@ -73,10 +73,10 @@ class Correlation1DOp : public Operator {
     CHECK_EQ(out.CheckContiguous(), true);
     CHECK_EQ(tmp1.CheckContiguous(), true);
     CHECK_EQ(tmp2.CheckContiguous(), true);
-    
+
     paddedbottomheight = data1.shape_[2];
     paddedbottomwidth  = data1.shape_[3] + 2 * param_.pad_size;
-    
+
     kernel_radius_ = (param_.kernel_size - 1) / 2;
     border_size_ = param_.max_displacement + kernel_radius_;
     stride1 = param_.stride1;
@@ -90,7 +90,7 @@ class Correlation1DOp : public Operator {
        neighborhood_grid_width_ = neighborhood_grid_radius_ + 1;
     else
        neighborhood_grid_width_ = neighborhood_grid_radius_ * 2 + 1;
-    
+
     top_channels_ = neighborhood_grid_width_;
     num =  data1.shape_[0];
     channels = data1.shape_[1];
diff --git a/src/operator/correlation1D.cc b/src/operator/correlation1D.cc
index 50d1975e09be..869f432c17de 100644
--- a/src/operator/correlation1D.cc
+++ b/src/operator/correlation1D.cc
@@ -7,7 +7,7 @@
 #include "./correlation1D-inl.h"
 #include "./mshadow_op.h"
 
-namespace mshadow { 
+namespace mshadow {
 template<typename Dtype>
 inline void Correlation1DForward(const Tensor<cpu, 4, Dtype> &out,
                                const Tensor<cpu, 4, Dtype> &data1,
@@ -18,7 +18,7 @@ inline void Correlation1DForward(const Tensor<cpu, 4, Dtype> &out,
                                int pad_size_, int single_side,
                                int max_displacement_, int kernel_size_,
                                int neighborhood_grid_radius_, int neighborhood_grid_width_,
-                               int  kernel_radius_, int stride1_, int stride2_) 
+                               int  kernel_radius_, int stride1_, int stride2_)
                                {
         printf("No implementation");
 }
diff --git a/src/operator/correlation1D.cu b/src/operator/correlation1D.cu
index cc6ee8aac89b..0e73469611fe 100644
--- a/src/operator/correlation1D.cu
+++ b/src/operator/correlation1D.cu
@@ -27,22 +27,22 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
 namespace mshadow {
 namespace cuda {
 // == correlation1D Kernel
-template <typename Dtype> 
+template <typename Dtype>
 __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int topheight, int topchannels, int topcount,
   int max_displacement, int x_shift, int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2,
   int bottomwidth, int bottomheight, int bottomchannels,
-  const Dtype *bottom0, const Dtype *bottom1, Dtype *top) 
+  const Dtype *bottom0, const Dtype *bottom1, Dtype *top)
 {
   extern __shared__ char patch_data_char[];
-  
+
   Dtype *patch_data = (Dtype *)patch_data_char;
-  
+
     // First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1
   int x1 = blockIdx.x*stride1 + max_displacement;
   int y1 = blockIdx.y*stride1;
   int item = blockIdx.z;
   int ch_off = threadIdx.x;
-  
+
   // Load 3D patch into shared shared memory
   for(int j = 0; j < kernel_size; j++) { // HEIGHT
     for(int i = 0; i < kernel_size; i++) { // WIDTH
@@ -54,33 +54,33 @@ __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int t
       }
     }
   }
-  
+
   __syncthreads();
-  
+
   __shared__ Dtype sum[WARPS_PER_BLOCK*THREADS_PER_WARP];
-  
-  // Compute 
+
+  // Compute
   for(int top_channel = 0; top_channel < topchannels; top_channel++) {
     sum[ch_off] = 0;
-  
+
     int s2o = (top_channel % neighborhood_grid_width + x_shift) * stride2;
-    
+
     for(int j = 0; j < kernel_size; j++) { // HEIGHT
       for(int i = 0; i < kernel_size; i++) { // WIDTH
         int ji_off = ((j * kernel_size) + i) * bottomchannels;
         for(int ch = ch_off; ch < bottomchannels; ch += (WARPS_PER_BLOCK*THREADS_PER_WARP)) { // CHANNELS
           int x2 = x1 + s2o;
-          
+
           int idxPatchData = ji_off + ch;
           int idx2 = ((item * bottomheight + y1+j) * bottomwidth + x2+i) * bottomchannels + ch;
-          
+
           sum[ch_off] += patch_data[idxPatchData] * bottom1[idx2];
         }
       }
     }
-    
+
     __syncthreads();
-    
+
     if(ch_off == 0) {
         Dtype total_sum = 0;
         for(int idx = 0; idx < WARPS_PER_BLOCK*THREADS_PER_WARP; idx++) {
@@ -90,8 +90,8 @@ __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int t
         const int index = ((top_channel*topheight + blockIdx.y)*topwidth)+blockIdx.x;
         top[index + item*topcount] = total_sum / (float)sumelems;
     }
-  } 
-  // Aggregate  
+  }
+  // Aggregate
 }
 
 //  == correlation1D Backward Pass Kernel (For data1)
@@ -113,15 +113,15 @@ __global__ void Correlate1DDataBackward0(const int nthreads, int num, int item,
     // We use a large offset, for the inner part not to become negative.
     const int round_off = ROUND_OFF;
     const int round_off_s1 = stride1 * round_off;
-    
+
     // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
     int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1
     int ymin = (m - 2*kernel_radius - 0 + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1
-    
+
     // Same here:
     int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; // floor (l - max_displacement) / stride1
     int ymax = (m - 0 + round_off_s1) / stride1 - round_off; // floor (m - max_displacement) / stride1
-    
+
 
     Dtype sum = 0;
     if(xmax>=0 && ymax>=0 && (xmin<=topwidth-1) && (ymin<=topheight-1))
@@ -175,24 +175,24 @@ __global__ void Correlate1DDataBackward1(const int nthreads,
     int n = index % bottomchannels; //channels
     int l = (index / bottomchannels) % bottomwidth + pad_size; //w-pos
     int m = (index / bottomchannels / bottomwidth) % bottomheight; //h-pos
-    
+
     // round_off is a trick to enable integer division with ceil, even for negative numbers
     // We use a large offset, for the inner part not to become negative.
     const int round_off = ROUND_OFF;
     const int round_off_s1 = stride1 * round_off;
-    
+
     Dtype sum = 0;
     {
-      
+
       for(int o = x_shift; o < x_shift + neighborhood_grid_width; o++) {
-        
+
         int s2o = stride2 * o;
-        
+
         //Get X,Y ranges and clamp
         // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
         int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1
         int ymin = (m - 2*kernel_radius - 0 - 0 + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1
-        
+
         // Same here:
         int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; // floor (l - max_displacement - s2o) / stride1
         int ymax = (m - 0 - 0 + round_off_s1) / stride1 - round_off; // floor (m - max_displacement - 0) / stride1
@@ -249,7 +249,7 @@ int channels, int width, int height, int widthheight, int padding, int pwidthhei
     int ypad  = (xy / width + 0);
     int xypad = ypad * (width+2*padding) + xpad;
 
-    out[(n*pwidthheight+xypad)*channels + ch] = value;   
+    out[(n*pwidthheight+xypad)*channels + ch] = value;
 }
 template <typename Dtype>
 void Forward_gpu(
@@ -287,10 +287,10 @@ void Forward_gpu(
     const int height = bheight;
     const int width = bwidth + 2 * pad_size_;
     const int shared_memory_per_block = (kernel_size_ * kernel_size_) * bchannels;
-     
+
     int x_shift = - neighborhood_grid_radius_;
     if(single_side == -1) { // to the left
-      x_shift = -neighborhood_grid_width_;
+      x_shift = - neighborhood_grid_radius_;
     } else if(single_side == 1) { // to the right
       x_shift = 0;
     }
@@ -335,10 +335,10 @@ void Backward_gpu(
     int botThreadCount = bottomcount;
     const int gridSize = (botThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
     //  correlation1DLayerBackward
-    
+
     int x_shift = - neighborhood_grid_radius_;
     if (single_side == -1) { // to the left
-      x_shift = -neighborhood_grid_width_;
+      x_shift = -neighborhood_grid_radius_;
     } else if(single_side == 1) { // to the right
       x_shift = 0;
     }