-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmain.cu
490 lines (430 loc) · 15.2 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
/*
* Authors:
* Oded Green ([email protected]), Rob McColl ([email protected])
* High Performance Computing Lab, Georgia Tech
*
* Future Publication:
* GPU MergePath: A GPU Merging Algorithm
* ACM International Conference on Supercomputing 2012
* June 25-29 2012, San Servolo, Venice, Italy
*
* Copyright (c) 2012 Georgia Institute of Technology
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the Georgia Institute of Technology nor the names of
* its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <limits.h>
#include <stdlib.h>
#include <float.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/fill.h>
#include <thrust/generate.h>
#include <thrust/merge.h>
#define CSV 0
#if(CSV)
#define PS(X, S) std::cout << X << ", " << S << ", "; fflush(stdout);
#define PV(X) std::cout << X << ", "; fflush(stdout);
#else
#define PS(X, S) std::cout << X << " " << S <<" :\n"; fflush(stdout);
#define PV(X) std::cout << "\t" << #X << " \t: " << X << "\n"; fflush(stdout);
#endif
/* GLOBAL FUNCTION DECLARATIONS */
template<typename vec_t>
__global__ void cudaWorkloadDiagonals(vec_t * A, uint32_t A_length, vec_t * B, uint32_t B_length,
uint32_t * diagonal_path_intersections);
template<typename vec_t, bool timesections, bool countloops>
__global__ void cudaMergeSinglePath(vec_t * A, uint32_t A_length, vec_t * B, uint32_t B_length,
uint32_t * diagonal_path_intersections, vec_t * C, uint32_t C_length,
float * times, uint32_t * loopCount);
/* POSITIVEINFINITY
* Returns maximum value of a type
*/
template<typename vec_t>
__host__ __device__ vec_t getPositiveInfinity() {
vec_t tmp = 0;
return positiveInfinity(tmp);
}
__host__ __device__ float positiveInfinity(float tmp) {
return FLT_MAX;
}
__host__ __device__ double positiveInfinity(double tmp) {
return DBL_MAX;
}
__host__ __device__ uint32_t positiveInfinity(uint32_t tmp) {
return 0xFFFFFFFFUL;
}
__host__ __device__ uint64_t positiveInfinity(uint64_t tmp) {
return 0xFFFFFFFFFFFFFFFFUL;
}
/* NEGATIVEINFINITY
* Returns minimum value of a type
*/
template<typename vec_t>
__host__ __device__ vec_t getNegativeInfinity() {
vec_t tmp = 0;
return negativeInfinity(tmp);
}
__host__ __device__ float negativeInfinity(float tmp) {
return FLT_MIN;
}
__host__ __device__ double negativeInfinity(double tmp) {
return DBL_MIN;
}
__host__ __device__ uint32_t negativeInfinity(uint32_t tmp) {
return 0;
}
__host__ __device__ uint64_t negativeInfinity(uint64_t tmp) {
return 0;
}
/* RAND64
* Gives up to 64-bits of pseudo-randomness
* Note: not very "good" or "random"
*/
template<typename vec_t>
vec_t rand64() {
vec_t rtn;
do {
uint32_t * rtn32 = (uint32_t *)&rtn;
rtn32[0] = rand();
if(sizeof(vec_t) > 4) rtn32[1] = rand();
} while(!(rtn < getPositiveInfinity<vec_t>() &&
rtn > getNegativeInfinity<vec_t>()));
return rtn;
}
/* MERGETYPE
* Performs <runs> merges of two sorted pseudorandom <vec_t> arrays of length <size>
* Times the runs and reports on the average time
* Checks the output of each merge for correctness
*/
#define PADDING 1024
template<typename vec_t, uint32_t blocks, uint32_t threads, uint32_t runs>
void mergeType(uint64_t size) {
// Prepare host and device vectors
thrust::host_vector<vec_t>hostA(size + (PADDING));
thrust::host_vector<vec_t>hostB(size + (PADDING));
thrust::host_vector<vec_t>hostC(2*size + (PADDING));
thrust::device_vector<vec_t>A;
thrust::device_vector<vec_t>B;
thrust::device_vector<vec_t>C(2*size + (PADDING));
thrust::device_vector<uint32_t> diagonal_path_intersections(2 * (blocks + 1));
float diag = 0;
float merge = 0;
uint32_t errors = 0;
// Fore each run
for(uint32_t i = 0; i < runs; i++) {
// Generate two sorted psuedorandom arrays
thrust::generate(hostA.begin(), hostA.end(), rand64<vec_t>);
thrust::generate(hostB.begin(), hostB.end(), rand64<vec_t>);
thrust::fill(hostA.begin() + size, hostA.end(), getPositiveInfinity<vec_t>());
thrust::fill(hostB.begin() + size, hostB.end(), getPositiveInfinity<vec_t>());
A = hostA;
B = hostB;
thrust::sort(A.begin(), A.end());
thrust::sort(B.begin(), B.end());
// Perform the global diagonal intersection serach to divide work among SMs
float temp;
{
cudaEvent_t start_event, stop_event;
cudaEventCreate(&start_event);
cudaEventCreate(&stop_event);
cudaEventRecord(start_event, 0);
cudaWorkloadDiagonals<vec_t><<<blocks, 32>>>
(A.data().get(), size, B.data().get(),
size, diagonal_path_intersections.data().get());
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&temp, start_event, stop_event);
diag += temp;
}
// Merge between global diagonals independently on each block
{
cudaEvent_t start_event, stop_event;
cudaEventCreate(&start_event);
cudaEventCreate(&stop_event);
cudaEventRecord(start_event, 0);
cudaMergeSinglePath<vec_t,false,false><<<blocks, threads>>>
(A.data().get(), size, B.data().get(), size, diagonal_path_intersections.data().get(),
C.data().get(), size * 2, NULL, NULL);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&temp, start_event, stop_event);
merge += temp;
}
// Test for errors
hostC = C;
for(uint32_t i = 1; i < size; i++) {
errors += hostC[i] < hostC[i-1];
}
}
// Print timing results
diag /= runs;
merge /= runs;
float total = diag + merge;
PV(diag);
PV(merge);
PV(total);
PV(errors);
}
/* MERGEALLTYPES
* Performs <runs> merge tests for each type at a given size
*/
template<uint32_t blocks, uint32_t threads, uint32_t runs>
void mergeAllTypes(uint64_t size) {
PS("uint32_t", size) mergeType<uint32_t, blocks, threads, runs>(size); printf("\n");
PS("float", size) mergeType<float, blocks, threads, runs>(size); printf("\n");
PS("uint64_t", size) mergeType<uint64_t, blocks, threads, runs>(size); printf("\n");
PS("double", size) mergeType<double, blocks, threads, runs>(size); printf("\n");
}
/* MAIN
* Generates random arrays, merges them.
*/
int main(int argc, char *argv[]) {
#define blocks 112
#define threads 128
#define runs 10
mergeAllTypes<blocks, threads, runs>(1000000);
mergeAllTypes<blocks, threads, runs>(10000000);
mergeAllTypes<blocks, threads, runs>(100000000);
}
/* CUDAWORKLOADDIAGONALS
* Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path.
* This divides the workload into independent merges for the next step
*/
#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y))
#define MIN(X,Y) (((X) < (Y)) ? (X) : (Y))
template<typename vec_t>
__global__ void cudaWorkloadDiagonals(vec_t * A, uint32_t A_length, vec_t * B, uint32_t B_length,
uint32_t * diagonal_path_intersections) {
// Calculate combined index around the MergePath "matrix"
int32_t combinedIndex = (uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length) / (uint64_t)gridDim.x;
__shared__ int32_t x_top, y_top, x_bottom, y_bottom, found;
__shared__ int32_t oneorzero[32];
int threadOffset = threadIdx.x - 16;
// Figure out the coordinates of our diagonal
x_top = MIN(combinedIndex, A_length);
y_top = combinedIndex > (A_length) ? combinedIndex - (A_length) : 0;
x_bottom = y_top;
y_bottom = x_top;
found = 0;
// Search the diagonal
while(!found) {
// Update our coordinates within the 32-wide section of the diagonal
int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset;
int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset;
// Are we a '1' or '0' with respect to A[x] <= B[x]
if(current_x >= A_length || current_y < 0) {
oneorzero[threadIdx.x] = 0;
} else if(current_y >= B_length || current_x < 1) {
oneorzero[threadIdx.x] = 1;
} else {
oneorzero[threadIdx.x] = (A[current_x-1] <= B[current_y]) ? 1 : 0;
}
__syncthreads();
// If we find the meeting of the '1's and '0's, we found the
// intersection of the path and diagonal
if(threadIdx.x > 0 && (oneorzero[threadIdx.x] != oneorzero[threadIdx.x-1])) {
found = 1;
diagonal_path_intersections[blockIdx.x] = current_x;
diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y;
}
__syncthreads();
// Adjust the search window on the diagonal
if(threadIdx.x == 16) {
if(oneorzero[31] != 0) {
x_bottom = current_x;
y_bottom = current_y;
} else {
x_top = current_x;
y_top = current_y;
}
}
__syncthreads();
}
// Set the boundary diagonals (through 0,0 and A_length,B_length)
if(threadIdx.x == 0 && blockIdx.x == 0) {
diagonal_path_intersections[0] = 0;
diagonal_path_intersections[gridDim.x + 1] = 0;
diagonal_path_intersections[gridDim.x] = A_length;
diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length;
}
}
/* CUDAMERGESINGLEPATH
* Performs merge windows within a thread block from that block's global diagonal
* intersection to the next
*/
#define K 512
template<typename vec_t, bool timesections, bool countloops>
__global__ void cudaMergeSinglePath(vec_t * A, uint32_t A_length, vec_t * B, uint32_t B_length,
uint32_t * diagonal_path_intersections, vec_t * C, uint32_t C_length,
float * times, uint32_t * loopCount) {
// Setup timers
clock_t temp, memread = 0, cshared = 0, cglobal = 0;
__shared__ clock_t search, update;
search = 0;
update = 0;
clock_t init;
if(timesections) {
init = clock();
}
// Storage space for local merge window
__shared__ vec_t A_shared[K+2 << 1];
vec_t* B_shared = A_shared + K+2;
__shared__ uint32_t x_block_top, y_block_top, x_block_stop, y_block_stop;
// Pre-calculate reused indices
uint32_t threadIdX4 = threadIdx.x + threadIdx.x;
threadIdX4 = threadIdX4 + threadIdX4;
uint32_t threadIdX4p1 = threadIdX4 + 1;
uint32_t threadIdX4p2 = threadIdX4p1 + 1;
uint32_t threadIdX4p3 = threadIdX4p2 + 1;
uint32_t Ax, Bx;
// Define global window and create sentinels
switch(threadIdx.x) {
case 0:
x_block_top = diagonal_path_intersections[blockIdx.x];
A_shared[0] = getNegativeInfinity<vec_t>();
break;
case 64:
y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1];
A_shared[K+1] = getPositiveInfinity<vec_t>();
break;
case 32:
x_block_stop = diagonal_path_intersections[blockIdx.x + 1];
B_shared[0] = getNegativeInfinity<vec_t>();
break;
case 96:
y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2];
B_shared[K+1] = getPositiveInfinity<vec_t>();
break;
default:
break;
}
A--;
B--;
__syncthreads();
if(timesections) {
init = clock() - init;
}
if(countloops) {
if(threadIdx.x == 0) loopCount[blockIdx.x] = 0;
}
// Construct and merge windows from diagonal_path_intersections[blockIdx.x]
// to diagonal_path_intersections[blockIdx.x+1]
while(((x_block_top < x_block_stop) || (y_block_top < y_block_stop))) {
if(countloops) {
if(threadIdx.x == 0) loopCount[blockIdx.x]++;
}
if(timesections) {
temp = clock();
}
// Load current local window
{
vec_t * Atemp = A + x_block_top;
vec_t * Btemp = B + y_block_top;
uint32_t sharedX = threadIdx.x+1;
A_shared[sharedX] = Atemp[sharedX];
B_shared[sharedX] = Btemp[sharedX];
sharedX += blockDim.x;
A_shared[sharedX] = Atemp[sharedX];
B_shared[sharedX] = Btemp[sharedX];
sharedX += blockDim.x;
A_shared[sharedX] = Atemp[sharedX];
B_shared[sharedX] = Btemp[sharedX];
sharedX += blockDim.x;
A_shared[sharedX] = Atemp[sharedX];
B_shared[sharedX] = Btemp[sharedX];
}
// Make sure this is before the sync
vec_t *Ctemp = C + x_block_top + y_block_top;
__syncthreads();
if(timesections) {
memread += clock() - temp;
temp = clock();
}
// Binary search diagonal in the local window for path
{
int32_t offset = threadIdX4 >> 1;
Ax = offset + 1;
vec_t * BSm1 = B_shared + threadIdX4p2;
vec_t * BS = BSm1 + 1;
while(true) {
offset = ((offset+1) >> 1);
if(A_shared[Ax] > BSm1[~Ax]) {
if(A_shared[Ax-1] <= BS[~Ax]) {
//Found it
break;
}
Ax -= offset;
} else {
Ax += offset;
}
}
}
Bx = threadIdX4p2 - Ax;
if(timesections) {
if(threadIdx.x == 127) search += clock() - temp;
temp = clock();
}
// Merge four elements starting at the found path intersection
vec_t Ai, Bi, Ci;
Ai = A_shared[Ax];
Bi = B_shared[Bx];
if(Ai > Bi) {Ci = Bi; Bx++; Bi = B_shared[Bx];} else {Ci = Ai; Ax++; Ai = A_shared[Ax];}
Ctemp[threadIdX4] = Ci;
if(Ai > Bi) {Ci = Bi; Bx++; Bi = B_shared[Bx];} else {Ci = Ai; Ax++; Ai = A_shared[Ax];}
Ctemp[threadIdX4p1] = Ci;
if(Ai > Bi) {Ci = Bi; Bx++; Bi = B_shared[Bx];} else {Ci = Ai; Ax++; Ai = A_shared[Ax];}
Ctemp[threadIdX4p2] = Ci;
Ctemp[threadIdX4p3] = Ai > Bi ? Bi : Ai;
if(timesections) {
if(threadIdx.x == 0) cglobal += clock() - temp;
temp = clock();
}
// Update for next window
if(threadIdx.x == 127) {
x_block_top += Ax - 1;
y_block_top += Bx - 1;
}
if(timesections) {
if(threadIdx.x == 127) update += clock() - temp;
}
__syncthreads();
} // Go to next window
if(timesections) {
float total = memread + search + cshared + cglobal + update;
if(threadIdx.x == 0) {
times[blockIdx.x] = memread/total;
times[blockIdx.x + blockDim.x] = search/total;
times[blockIdx.x + 2*blockDim.x] = cshared/total;
times[blockIdx.x + 3*blockDim.x] = cglobal/total;
times[blockIdx.x + 4*blockDim.x] = update/total;
}
}
}