diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp index c4fda1220c3..5219fa5cb5f 100644 --- a/include/caffe/layers/crop_layer.hpp +++ b/include/caffe/layers/crop_layer.hpp @@ -41,13 +41,15 @@ class CropLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - vector offsets; + Blob offsets; + Blob src_strides_; + Blob dest_strides_; private: // Recursive copy function. void crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index ef8c177c4dd..65ea8f8b7d0 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -40,8 +40,10 @@ void CropLayer::Reshape(const vector*>& bottom, const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis()); // Initialize offsets to 0 and the new shape to the current shape of the data. - offsets = vector(input_dim, 0); vector new_shape(bottom[0]->shape()); + vector offsets_shape(1, input_dim); + offsets.Reshape(offsets_shape); + int* offset_data = offsets.mutable_cpu_data(); // Determine crop offsets and the new shape post-crop. for (int i = 0; i < input_dim; ++i) { @@ -63,15 +65,22 @@ void CropLayer::Reshape(const vector*>& bottom, << "size " << bottom[1]->shape(i) << " and offset " << crop_offset; } new_shape[i] = new_size; - offsets[i] = crop_offset; + offset_data[i] = crop_offset; } top[0]->Reshape(new_shape); + // Compute strides + src_strides_.Reshape(offsets_shape); + dest_strides_.Reshape(offsets_shape); + for (int i = 0; i < input_dim; ++i) { + src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim); + dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim); + } } template void CropLayer::crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, @@ -115,7 +124,8 @@ void CropLayer::Forward_cpu(const vector*>& bottom, std::vector indices(top[0]->num_axes(), 0); const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true); + crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data, + true); } template @@ -127,7 +137,8 @@ void CropLayer::Backward_cpu(const vector*>& top, if (propagate_down[0]) { caffe_set(bottom[0]->count(), static_cast(0), bottom_diff); std::vector indices(top[0]->num_axes(), 0); - crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false); + crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff, + bottom_diff, false); } } diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 677077cdd8b..a400f333e14 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -4,90 +4,62 @@ namespace caffe { -// Copy (one line per thread) from one array to another, with arbitrary -// strides in the last two dimensions. +__device__ int compute_uncropped_index( + int index, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets) { + int dest_index = index; + int src_index = 0; + for (int i = 0; i < ndims; ++i) { + int coord = dest_index / dest_strides[i]; + dest_index -= coord * dest_strides[i]; + src_index += src_strides[i] * (coord + offsets[i]); + } + return src_index; +} + template -__global__ void copy_kernel(const int n, const int height, const int width, - const int src_inner_stride, - const int dest_inner_stride, +__global__ void crop_kernel_forward(const int nthreads, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets, const Dtype* src, Dtype* dest) { - CUDA_KERNEL_LOOP(index, n) { - int src_start = index * src_inner_stride; - int dest_start = index * dest_inner_stride; - for (int i = 0; i < width; ++i) { - dest[dest_start + i] = src[src_start + i]; - } + CUDA_KERNEL_LOOP(index, nthreads) { + int src_index = compute_uncropped_index( + index, ndims, src_strides, dest_strides, offsets); + dest[index] = src[src_index]; } } template -void CropLayer::crop_copy_gpu(const vector*>& bottom, - const vector*>& top, - const vector& offsets, - vector indices, - int cur_dim, - const Dtype* src_data, - Dtype* dest_data, - bool is_forward) { - if (cur_dim + 2 < top[0]->num_axes()) { - // We are not yet at the final dimension, call copy recursivley - for (int i = 0; i < top[0]->shape(cur_dim); ++i) { - indices[cur_dim] = i; - crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1, - src_data, dest_data, is_forward); - } - } else { - // We are at the last two dimensions, which are stored continuously in - // memory. With (N,C,H,W) - // (0,1,2,3) cur_dim -> H - // cur_dim+1 -> W - const int lines = top[0]->shape(cur_dim); - const int height = top[0]->shape(cur_dim); - const int width = top[0]->shape(cur_dim+1); - std::vector ind_off(cur_dim+2, 0); - for (int j = 0; j < cur_dim; ++j) { - ind_off[j] = indices[j] + offsets[j]; - } - ind_off[cur_dim] = offsets[cur_dim]; - ind_off[cur_dim+1] = offsets[cur_dim+1]; - // Compute copy strides - const int src_inner_stride = bottom[0]->shape(cur_dim+1); - const int dest_inner_stride = top[0]->shape(cur_dim+1); - - if (is_forward) { - const Dtype* bottom_data = bottom[0]->gpu_data() + - bottom[0]->offset(ind_off); - Dtype* top_data = top[0]->mutable_gpu_data() + - top[0]->offset(indices); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - src_inner_stride, - dest_inner_stride, - bottom_data, top_data); - - } else { - const Dtype* top_diff = top[0]->gpu_diff() + - top[0]->offset(indices); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() + - bottom[0]->offset(ind_off); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - dest_inner_stride, - src_inner_stride, - top_diff, bottom_diff); - } +__global__ void crop_kernel_backward(const int nthreads, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets, + Dtype* src, const Dtype* dest) { + CUDA_KERNEL_LOOP(index, nthreads) { + int src_index = compute_uncropped_index( + index, ndims, src_strides, dest_strides, offsets); + src[src_index] = dest[index]; } } template void CropLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - std::vector indices(top[0]->num_axes(), 0); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true); + int n = top[0]->count(); + crop_kernel_forward<<>>(n, + bottom[0]->num_axes(), + src_strides_.gpu_data(), + dest_strides_.gpu_data(), + offsets.gpu_data(), + bottom_data, top_data); } template @@ -95,12 +67,16 @@ void CropLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + int n = top[0]->count(); if (propagate_down[0]) { caffe_gpu_set(bottom[0]->count(), static_cast(0), bottom_diff); - std::vector indices(top[0]->num_axes(), 0); - crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff, - false); + crop_kernel_backward<<>>(n, + bottom[0]->num_axes(), + src_strides_.gpu_data(), + dest_strides_.gpu_data(), + offsets.gpu_data(), + bottom_diff, top_diff); } }