Skip to content
This repository has been archived by the owner on Jun 27, 2022. It is now read-only.

Commit

Permalink
fix correlation1D illegal cuda memory bug
Browse files Browse the repository at this point in the history
  • Loading branch information
StOnEGiggity committed Dec 11, 2018
1 parent 1c561f7 commit 9bc2a7c
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 37 deletions.
8 changes: 4 additions & 4 deletions src/operator/correlation1D-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* Copyright (c) 2015 by Contributors
* \file correlation1D-inl.h
* \brief correlation1D operator and symbol
* \author Xu Dong
* \author Xu Dong
*/
#ifndef MXNET_OPERATOR_CORRELATION1D_INL_H_
#define MXNET_OPERATOR_CORRELATION1D_INL_H_
Expand Down Expand Up @@ -73,10 +73,10 @@ class Correlation1DOp : public Operator {
CHECK_EQ(out.CheckContiguous(), true);
CHECK_EQ(tmp1.CheckContiguous(), true);
CHECK_EQ(tmp2.CheckContiguous(), true);

paddedbottomheight = data1.shape_[2];
paddedbottomwidth = data1.shape_[3] + 2 * param_.pad_size;

kernel_radius_ = (param_.kernel_size - 1) / 2;
border_size_ = param_.max_displacement + kernel_radius_;
stride1 = param_.stride1;
Expand All @@ -90,7 +90,7 @@ class Correlation1DOp : public Operator {
neighborhood_grid_width_ = neighborhood_grid_radius_ + 1;
else
neighborhood_grid_width_ = neighborhood_grid_radius_ * 2 + 1;

top_channels_ = neighborhood_grid_width_;
num = data1.shape_[0];
channels = data1.shape_[1];
Expand Down
4 changes: 2 additions & 2 deletions src/operator/correlation1D.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "./correlation1D-inl.h"
#include "./mshadow_op.h"

namespace mshadow {
namespace mshadow {
template<typename Dtype>
inline void Correlation1DForward(const Tensor<cpu, 4, Dtype> &out,
const Tensor<cpu, 4, Dtype> &data1,
Expand All @@ -18,7 +18,7 @@ inline void Correlation1DForward(const Tensor<cpu, 4, Dtype> &out,
int pad_size_, int single_side,
int max_displacement_, int kernel_size_,
int neighborhood_grid_radius_, int neighborhood_grid_width_,
int kernel_radius_, int stride1_, int stride2_)
int kernel_radius_, int stride1_, int stride2_)
{
printf("No implementation");
}
Expand Down
62 changes: 31 additions & 31 deletions src/operator/correlation1D.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,22 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
namespace mshadow {
namespace cuda {
// == correlation1D Kernel
template <typename Dtype>
template <typename Dtype>
__global__ void Correlate1DData(const int nthreads, int num, int topwidth, int topheight, int topchannels, int topcount,
int max_displacement, int x_shift, int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2,
int bottomwidth, int bottomheight, int bottomchannels,
const Dtype *bottom0, const Dtype *bottom1, Dtype *top)
const Dtype *bottom0, const Dtype *bottom1, Dtype *top)
{
extern __shared__ char patch_data_char[];

Dtype *patch_data = (Dtype *)patch_data_char;

// First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1
int x1 = blockIdx.x*stride1 + max_displacement;
int y1 = blockIdx.y*stride1;
int item = blockIdx.z;
int ch_off = threadIdx.x;

// Load 3D patch into shared shared memory
for(int j = 0; j < kernel_size; j++) { // HEIGHT
for(int i = 0; i < kernel_size; i++) { // WIDTH
Expand All @@ -54,33 +54,33 @@ __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int t
}
}
}

__syncthreads();

__shared__ Dtype sum[WARPS_PER_BLOCK*THREADS_PER_WARP];
// Compute

// Compute
for(int top_channel = 0; top_channel < topchannels; top_channel++) {
sum[ch_off] = 0;

int s2o = (top_channel % neighborhood_grid_width + x_shift) * stride2;

for(int j = 0; j < kernel_size; j++) { // HEIGHT
for(int i = 0; i < kernel_size; i++) { // WIDTH
int ji_off = ((j * kernel_size) + i) * bottomchannels;
for(int ch = ch_off; ch < bottomchannels; ch += (WARPS_PER_BLOCK*THREADS_PER_WARP)) { // CHANNELS
int x2 = x1 + s2o;

int idxPatchData = ji_off + ch;
int idx2 = ((item * bottomheight + y1+j) * bottomwidth + x2+i) * bottomchannels + ch;

sum[ch_off] += patch_data[idxPatchData] * bottom1[idx2];
}
}
}

__syncthreads();

if(ch_off == 0) {
Dtype total_sum = 0;
for(int idx = 0; idx < WARPS_PER_BLOCK*THREADS_PER_WARP; idx++) {
Expand All @@ -90,8 +90,8 @@ __global__ void Correlate1DData(const int nthreads, int num, int topwidth, int t
const int index = ((top_channel*topheight + blockIdx.y)*topwidth)+blockIdx.x;
top[index + item*topcount] = total_sum / (float)sumelems;
}
}
// Aggregate
}
// Aggregate
}

// == correlation1D Backward Pass Kernel (For data1)
Expand All @@ -113,15 +113,15 @@ __global__ void Correlate1DDataBackward0(const int nthreads, int num, int item,
// We use a large offset, for the inner part not to become negative.
const int round_off = ROUND_OFF;
const int round_off_s1 = stride1 * round_off;

// We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1
int ymin = (m - 2*kernel_radius - 0 + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1

// Same here:
int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; // floor (l - max_displacement) / stride1
int ymax = (m - 0 + round_off_s1) / stride1 - round_off; // floor (m - max_displacement) / stride1


Dtype sum = 0;
if(xmax>=0 && ymax>=0 && (xmin<=topwidth-1) && (ymin<=topheight-1))
Expand Down Expand Up @@ -175,24 +175,24 @@ __global__ void Correlate1DDataBackward1(const int nthreads,
int n = index % bottomchannels; //channels
int l = (index / bottomchannels) % bottomwidth + pad_size; //w-pos
int m = (index / bottomchannels / bottomwidth) % bottomheight; //h-pos

// round_off is a trick to enable integer division with ceil, even for negative numbers
// We use a large offset, for the inner part not to become negative.
const int round_off = ROUND_OFF;
const int round_off_s1 = stride1 * round_off;

Dtype sum = 0;
{

for(int o = x_shift; o < x_shift + neighborhood_grid_width; o++) {

int s2o = stride2 * o;

//Get X,Y ranges and clamp
// We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1
int ymin = (m - 2*kernel_radius - 0 - 0 + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1

// Same here:
int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; // floor (l - max_displacement - s2o) / stride1
int ymax = (m - 0 - 0 + round_off_s1) / stride1 - round_off; // floor (m - max_displacement - 0) / stride1
Expand Down Expand Up @@ -249,7 +249,7 @@ int channels, int width, int height, int widthheight, int padding, int pwidthhei
int ypad = (xy / width + 0);
int xypad = ypad * (width+2*padding) + xpad;

out[(n*pwidthheight+xypad)*channels + ch] = value;
out[(n*pwidthheight+xypad)*channels + ch] = value;
}
template <typename Dtype>
void Forward_gpu(
Expand Down Expand Up @@ -287,10 +287,10 @@ void Forward_gpu(
const int height = bheight;
const int width = bwidth + 2 * pad_size_;
const int shared_memory_per_block = (kernel_size_ * kernel_size_) * bchannels;

int x_shift = - neighborhood_grid_radius_;
if(single_side == -1) { // to the left
x_shift = -neighborhood_grid_width_;
x_shift = - neighborhood_grid_radius_;
} else if(single_side == 1) { // to the right
x_shift = 0;
}
Expand Down Expand Up @@ -335,10 +335,10 @@ void Backward_gpu(
int botThreadCount = bottomcount;
const int gridSize = (botThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
// correlation1DLayerBackward

int x_shift = - neighborhood_grid_radius_;
if (single_side == -1) { // to the left
x_shift = -neighborhood_grid_width_;
x_shift = -neighborhood_grid_radius_;
} else if(single_side == 1) { // to the right
x_shift = 0;
}
Expand Down

0 comments on commit 9bc2a7c

Please sign in to comment.