diff --git a/src/operator/correlation1D-inl.h b/src/operator/correlation1D-inl.h index 3705441e1492..01ab828556a1 100644 --- a/src/operator/correlation1D-inl.h +++ b/src/operator/correlation1D-inl.h @@ -2,7 +2,7 @@ * Copyright (c) 2015 by Contributors * \file correlation1D-inl.h * \brief correlation1D operator and symbol - * \author Xu Dong + * \author Xu Dong */ #ifndef MXNET_OPERATOR_CORRELATION1D_INL_H_ #define MXNET_OPERATOR_CORRELATION1D_INL_H_ @@ -73,10 +73,10 @@ class Correlation1DOp : public Operator { CHECK_EQ(out.CheckContiguous(), true); CHECK_EQ(tmp1.CheckContiguous(), true); CHECK_EQ(tmp2.CheckContiguous(), true); - + paddedbottomheight = data1.shape_[2]; paddedbottomwidth = data1.shape_[3] + 2 * param_.pad_size; - + kernel_radius_ = (param_.kernel_size - 1) / 2; border_size_ = param_.max_displacement + kernel_radius_; stride1 = param_.stride1; @@ -90,7 +90,7 @@ class Correlation1DOp : public Operator { neighborhood_grid_width_ = neighborhood_grid_radius_ + 1; else neighborhood_grid_width_ = neighborhood_grid_radius_ * 2 + 1; - + top_channels_ = neighborhood_grid_width_; num = data1.shape_[0]; channels = data1.shape_[1]; diff --git a/src/operator/correlation1D.cc b/src/operator/correlation1D.cc index 50d1975e09be..869f432c17de 100644 --- a/src/operator/correlation1D.cc +++ b/src/operator/correlation1D.cc @@ -7,7 +7,7 @@ #include "./correlation1D-inl.h" #include "./mshadow_op.h" -namespace mshadow { +namespace mshadow { template inline void Correlation1DForward(const Tensor &out, const Tensor &data1, @@ -18,7 +18,7 @@ inline void Correlation1DForward(const Tensor &out, int pad_size_, int single_side, int max_displacement_, int kernel_size_, int neighborhood_grid_radius_, int neighborhood_grid_width_, - int kernel_radius_, int stride1_, int stride2_) + int kernel_radius_, int stride1_, int stride2_) { printf("No implementation"); } diff --git a/src/operator/correlation1D.cu b/src/operator/correlation1D.cu index cc6ee8aac89b..0e73469611fe 100644 --- a/src/operator/correlation1D.cu +++ b/src/operator/correlation1D.cu @@ -27,22 +27,22 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ namespace mshadow { namespace cuda { // == correlation1D Kernel -template +template __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int topheight, int topchannels, int topcount, int max_displacement, int x_shift, int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2, int bottomwidth, int bottomheight, int bottomchannels, - const Dtype *bottom0, const Dtype *bottom1, Dtype *top) + const Dtype *bottom0, const Dtype *bottom1, Dtype *top) { extern __shared__ char patch_data_char[]; - + Dtype *patch_data = (Dtype *)patch_data_char; - + // First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1 int x1 = blockIdx.x*stride1 + max_displacement; int y1 = blockIdx.y*stride1; int item = blockIdx.z; int ch_off = threadIdx.x; - + // Load 3D patch into shared shared memory for(int j = 0; j < kernel_size; j++) { // HEIGHT for(int i = 0; i < kernel_size; i++) { // WIDTH @@ -54,33 +54,33 @@ __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int t } } } - + __syncthreads(); - + __shared__ Dtype sum[WARPS_PER_BLOCK*THREADS_PER_WARP]; - - // Compute + + // Compute for(int top_channel = 0; top_channel < topchannels; top_channel++) { sum[ch_off] = 0; - + int s2o = (top_channel % neighborhood_grid_width + x_shift) * stride2; - + for(int j = 0; j < kernel_size; j++) { // HEIGHT for(int i = 0; i < kernel_size; i++) { // WIDTH int ji_off = ((j * kernel_size) + i) * bottomchannels; for(int ch = ch_off; ch < bottomchannels; ch += (WARPS_PER_BLOCK*THREADS_PER_WARP)) { // CHANNELS int x2 = x1 + s2o; - + int idxPatchData = ji_off + ch; int idx2 = ((item * bottomheight + y1+j) * bottomwidth + x2+i) * bottomchannels + ch; - + sum[ch_off] += patch_data[idxPatchData] * bottom1[idx2]; } } } - + __syncthreads(); - + if(ch_off == 0) { Dtype total_sum = 0; for(int idx = 0; idx < WARPS_PER_BLOCK*THREADS_PER_WARP; idx++) { @@ -90,8 +90,8 @@ __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int t const int index = ((top_channel*topheight + blockIdx.y)*topwidth)+blockIdx.x; top[index + item*topcount] = total_sum / (float)sumelems; } - } - // Aggregate + } + // Aggregate } // == correlation1D Backward Pass Kernel (For data1) @@ -113,15 +113,15 @@ __global__ void Correlate1DDataBackward0(const int nthreads, int num, int item, // We use a large offset, for the inner part not to become negative. const int round_off = ROUND_OFF; const int round_off_s1 = stride1 * round_off; - + // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 int ymin = (m - 2*kernel_radius - 0 + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 - + // Same here: int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; // floor (l - max_displacement) / stride1 int ymax = (m - 0 + round_off_s1) / stride1 - round_off; // floor (m - max_displacement) / stride1 - + Dtype sum = 0; if(xmax>=0 && ymax>=0 && (xmin<=topwidth-1) && (ymin<=topheight-1)) @@ -175,24 +175,24 @@ __global__ void Correlate1DDataBackward1(const int nthreads, int n = index % bottomchannels; //channels int l = (index / bottomchannels) % bottomwidth + pad_size; //w-pos int m = (index / bottomchannels / bottomwidth) % bottomheight; //h-pos - + // round_off is a trick to enable integer division with ceil, even for negative numbers // We use a large offset, for the inner part not to become negative. const int round_off = ROUND_OFF; const int round_off_s1 = stride1 * round_off; - + Dtype sum = 0; { - + for(int o = x_shift; o < x_shift + neighborhood_grid_width; o++) { - + int s2o = stride2 * o; - + //Get X,Y ranges and clamp // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 int ymin = (m - 2*kernel_radius - 0 - 0 + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 - + // Same here: int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; // floor (l - max_displacement - s2o) / stride1 int ymax = (m - 0 - 0 + round_off_s1) / stride1 - round_off; // floor (m - max_displacement - 0) / stride1 @@ -249,7 +249,7 @@ int channels, int width, int height, int widthheight, int padding, int pwidthhei int ypad = (xy / width + 0); int xypad = ypad * (width+2*padding) + xpad; - out[(n*pwidthheight+xypad)*channels + ch] = value; + out[(n*pwidthheight+xypad)*channels + ch] = value; } template void Forward_gpu( @@ -287,10 +287,10 @@ void Forward_gpu( const int height = bheight; const int width = bwidth + 2 * pad_size_; const int shared_memory_per_block = (kernel_size_ * kernel_size_) * bchannels; - + int x_shift = - neighborhood_grid_radius_; if(single_side == -1) { // to the left - x_shift = -neighborhood_grid_width_; + x_shift = - neighborhood_grid_radius_; } else if(single_side == 1) { // to the right x_shift = 0; } @@ -335,10 +335,10 @@ void Backward_gpu( int botThreadCount = bottomcount; const int gridSize = (botThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; // correlation1DLayerBackward - + int x_shift = - neighborhood_grid_radius_; if (single_side == -1) { // to the left - x_shift = -neighborhood_grid_width_; + x_shift = -neighborhood_grid_radius_; } else if(single_side == 1) { // to the right x_shift = 0; }