-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstencil_cuda.h
296 lines (238 loc) · 10.5 KB
/
stencil_cuda.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#include <cmath>
#include <cstdlib>

#include "utils_check_stencil.h"
template <typename _TYPE_>
__global__ void set_to_idx(_TYPE_ *array, int N)
{
    // One thread per element: element i is initialized to the value i.
    // Used by the test mode of the driver to detect indexing errors.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= 0 && tid < N)
    {
        array[tid] = static_cast<_TYPE_>(tid);
    }
}
template <typename _TYPE_>
__global__ void set_to(_TYPE_ *array, _TYPE_ value, int N)
{
    // One thread per element: fills array[0..N) with a constant value.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= 0 && tid < N)
    {
        array[tid] = value;
    }
}
template <typename _TYPE_, int radius, bool use_buffer=true>
__global__ void stencil_cuda_kernel(_TYPE_ *input, _TYPE_ *output, int N)
{
    // 1D stencil straight from global memory: output[i] = sum of
    // input[i-radius .. i+radius] for every interior cell.
    // use_buffer=true accumulates in a register (single global store);
    // use_buffer=false read-modify-writes output each iteration, which
    // the benchmark uses to show the cost of skipping the buffer.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    // Only cells with a full neighborhood of 'radius' on each side are updated.
    if (tid < radius || tid >= N - radius)
        return;
    if constexpr (use_buffer)
    {
        _TYPE_ acc = 0;
        for (int k = -radius; k <= radius; k++)
        {
            acc += input[tid + k];
        }
        output[tid] = acc;
    }
    else
    {
        for (int k = -radius; k <= radius; k++)
        {
            output[tid] += input[tid + k];
        }
    }
}
template <typename _TYPE_, int radius, bool use_buffer=true>
__global__ void stencil_cuda_shared_memory_kernel(_TYPE_ *input, _TYPE_ *output, int N)
{
    // Shared-memory tiled 1D stencil: each block stages its tile of 'input'
    // (plus a halo of 'radius' cells on each side) into shared memory, then
    // computes output[i] = sum of input[i-radius .. i+radius] for interior cells.
    // Precondition: launched with blockDim.x == BLOCK_SIZE (the static shared
    // array is sized with BLOCK_SIZE, not blockDim.x).
    // global index (spans from 0 to N)
    const int gidx = blockIdx.x * blockDim.x + threadIdx.x;
    // local index in the block (spans from 0 to blockDim.x)
    const int idx_loc = threadIdx.x;
    // local index offset by radius to match the shared memory layout:
    // [0, radius) left halo | [radius, blockDim.x+radius) tile | right halo
    const int idx_shared = threadIdx.x + radius;
    // Statically allocated shared memory (tile + two halos)
    __shared__ _TYPE_ shared[BLOCK_SIZE + 2 * radius];
    // Only in-range threads load data into shared memory
    if ((gidx >= 0) && (gidx < N))
    {
        // Fill the main tile
        shared[idx_shared] = input[gidx];
        // The leftmost 'radius' threads fill the left halo
        if ((idx_loc < radius) && (gidx - radius >= 0))
        {
            shared[idx_shared - radius] = input[gidx - radius];
        }
        // The rightmost 'radius' threads fill the right halo
        if ((idx_shared >= blockDim.x) && (gidx + radius < N))
        {
            shared[idx_shared + radius] = input[gidx + radius];
        }
    }
    // BUGFIX: the barrier must be reached by ALL threads of the block,
    // including out-of-range ones in the last (partial) block. Placing
    // __syncthreads() inside the gidx < N branch is undefined behavior.
    __syncthreads();
    // Stencil reduction, only for cells with a full neighborhood
    if ((gidx >= radius) && (gidx < N - radius))
    {
        // Use a register buffer ! crucial for performance
        _TYPE_ result = 0;
        for (int i = -radius; i <= radius; i++)
        {
            if constexpr (use_buffer)
            {
                result += shared[idx_shared + i];
            }
            else
            {
                // Deliberate slow path: read-modify-write global memory
                output[gidx] += shared[idx_shared + i];
            }
        }
        if constexpr (use_buffer) output[gidx] = result;
    }
}
template <typename _TYPE_, int radius>
void stencil_cuda(const int MemSizeArraysMB, const int N_imposed = -1)
{
    // Host driver: verifies, then times, the stencil kernel variants
    // (global-memory and shared-memory, each with and without a register buffer).
    //
    // MemSizeArraysMB : target size of each device array in MB (sets N).
    // N_imposed       : if > 0, overrides N and enables test mode, where
    //                   input[i] = i and results are checked against the exact
    //                   closed-form sum (may fail for large N due to round-off).
    //                   Otherwise input[i] = 1 and every interior output cell
    //                   must equal 2*radius + 1.
    //
    // Relies on file-level constants/helpers: MB, KB, BLOCK_SIZE,
    // NREPEAT_KERNEL, exact_result_stencil_kernel, print_perf.
    // CUDA events used for all timings
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    std::cout << "\n ||| CUDA KERNELS, dtype = " << typeid(_TYPE_).name() << " |||\n";
    // Problem dimensioning
    const bool test_mode = N_imposed > 0;
    const int N = test_mode ? N_imposed : MB * MemSizeArraysMB / sizeof(_TYPE_);
    // size_t avoids int overflow for arrays of 2 GB or more
    const size_t dataSize = static_cast<size_t>(N) * sizeof(_TYPE_);
    // Operation and memory-access counts over the whole benchmark loop:
    // (2*radius + 1) additions per interior cell
    long operations = NREPEAT_KERNEL * static_cast<long>(N - 2 * radius) * (2 * radius + 1);
    // assuming perfect caching: each interior cell reads input once, writes output once
    long mem_accesses = NREPEAT_KERNEL * static_cast<long>(N - 2 * radius) * 2;
    // Allocate GPU memory
    _TYPE_ *input, *output, *h_output;
    cudaMalloc((void **)&input, dataSize);
    cudaMalloc((void **)&output, dataSize);
    // Allocate CPU memory (used only for result verification)
    h_output = (_TYPE_ *)malloc(dataSize);
    // Launch configuration (ceil-div so the tail of the array is covered)
    const int NBLOCKS = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    std::cout << "\nN, NBLOCKS, BLOCK_SIZE= " << N << " " << NBLOCKS << " " << BLOCK_SIZE << "\n";
    // Shared memory consumed per block by the shared-memory kernel.
    // It is declared statically inside the kernel; reported here for information.
    // If BLOCK_SIZE or radius grow, check this value against
    // cudaDeviceProp::sharedMemPerBlock before launching.
    const int shared_memory_size = (BLOCK_SIZE + 2 * radius) * sizeof(_TYPE_);
    std::cout << "shared memory allocated per block = " << ((float)shared_memory_size) / KB << "KB \n";
    // --------------------------------Verification stencil_cuda_kernel-------------------------------- //
    // Initialize data
    if (test_mode)
    {
        set_to_idx<_TYPE_><<<NBLOCKS, BLOCK_SIZE>>>(input, N);
    }
    else
    {
        set_to<_TYPE_><<<NBLOCKS, BLOCK_SIZE>>>(input, 1, N);
    }
    set_to<_TYPE_><<<NBLOCKS, BLOCK_SIZE>>>(output, 0, N);
    // Check 1st kernel works
    stencil_cuda_kernel<_TYPE_, radius><<<NBLOCKS, BLOCK_SIZE>>>(input, output, N);
    // Kernel launches fail silently: surface configuration errors explicitly
    cudaError_t launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess)
        std::cout << "CUDA launch error: " << cudaGetErrorString(launch_err) << std::endl;
    cudaMemcpy(h_output, output, dataSize, cudaMemcpyDeviceToHost);
    _TYPE_ err = 0;
    for (int i = radius; i < N - radius; i++)
    {
        _TYPE_ sol_i = test_mode ? exact_result_stencil_kernel<_TYPE_, radius>(i) : (2 * radius + 1);
        // std::abs picks the float/double overload; plain C abs would truncate to int
        err += std::abs(h_output[i] - sol_i);
    }
    std::cout << "stencil_cuda_kernel error (must be 0)=" << err << std::endl;
    // --------------------------------Verification stencil_cuda_shared_memory_kernel -------------------------------- //
    // re-initialize data
    if (test_mode)
    {
        set_to_idx<_TYPE_><<<NBLOCKS, BLOCK_SIZE>>>(input, N);
    }
    else
    {
        set_to<_TYPE_><<<NBLOCKS, BLOCK_SIZE>>>(input, 1, N);
    }
    set_to<_TYPE_><<<NBLOCKS, BLOCK_SIZE>>>(output, 0, N);
    // Check 2nd kernel works. No dynamic shared-memory size is passed: the
    // kernel declares its shared memory statically, and passing the size here
    // would reserve the same amount twice (consistent with the timing launches).
    stencil_cuda_shared_memory_kernel<_TYPE_, radius><<<NBLOCKS, BLOCK_SIZE>>>(input, output, N);
    launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess)
        std::cout << "CUDA launch error: " << cudaGetErrorString(launch_err) << std::endl;
    cudaMemcpy(h_output, output, dataSize, cudaMemcpyDeviceToHost);
    _TYPE_ err_shared = 0;
    // BUGFIX: start at 'radius' (not 'radius + 1'): cell 'radius' is the first
    // interior cell and must be verified too, as in the first loop above.
    for (int i = radius; i < N - radius; i++)
    {
        _TYPE_ sol_i = test_mode ? exact_result_stencil_kernel<_TYPE_, radius>(i) : (2 * radius + 1);
        err_shared += std::abs(h_output[i] - sol_i);
    }
    std::cout << "stencil_cuda_shared_memory_kernel error (must be 0)=" << err_shared << std::endl;
    // --------------------------------Timing stencil_cuda_kernel-------------------------------- //
    // Measure 1st kernel execution time
    cudaEventRecord(start);
    for (size_t n = 0; n < NREPEAT_KERNEL; n++)
    {
        stencil_cuda_kernel<_TYPE_, radius><<<NBLOCKS, BLOCK_SIZE>>>(input, output, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    print_perf<_TYPE_>(operations, mem_accesses, ms, "** stencil_cuda_kernel **");
    // --------------------------------Timing stencil_cuda_kernel with no buffer-------------------------------- //
    cudaEventRecord(start);
    for (size_t n = 0; n < NREPEAT_KERNEL; n++)
    {
        stencil_cuda_kernel<_TYPE_, radius, false><<<NBLOCKS, BLOCK_SIZE>>>(input, output, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    print_perf<_TYPE_>(operations, mem_accesses, ms, "** stencil_cuda_kernel, no buffer **");
    // --------------------------------Timing stencil_cuda_shared_memory_kernel-------------------------------- //
    // Measure 2nd kernel execution time
    cudaEventRecord(start);
    for (size_t n = 0; n < NREPEAT_KERNEL; n++)
    {
        stencil_cuda_shared_memory_kernel<_TYPE_, radius><<<NBLOCKS, BLOCK_SIZE>>>(input, output, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms_shared;
    cudaEventElapsedTime(&ms_shared, start, stop);
    print_perf<_TYPE_>(operations, mem_accesses, ms_shared, "** stencil_cuda_shared_memory_kernel **");
    // --------------------------------Timing stencil_cuda_shared_memory_kernel with no buffer-------------------------------- //
    cudaEventRecord(start);
    for (size_t n = 0; n < NREPEAT_KERNEL; n++)
    {
        stencil_cuda_shared_memory_kernel<_TYPE_, radius, false><<<NBLOCKS, BLOCK_SIZE>>>(input, output, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms_shared, start, stop);
    print_perf<_TYPE_>(operations, mem_accesses, ms_shared, "** stencil_cuda_shared_memory_kernel, no buffer **");
    // Cleanup
    cudaFree(input);
    cudaFree(output);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(h_output);
    return;
}