// cublas_beamformer.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cstdlib>
#include <curand.h>
#include <assert.h>
#include <unistd.h>
#include <cublas_v2.h>
#include <iostream>
#include <complex.h>
#include <math.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include "cublas_beamformer.h"

using namespace std;
// Fill the device array A(nr_rows_A, nr_cols_A) with a deterministic test pattern (built on the host, then copied to the GPU).
void GPU_fill(cuComplex *A, int nr_rows_A, int nr_cols_A) {
	cuComplex *G = new cuComplex[nr_rows_A*nr_cols_A];
	for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
		G[i].x = (i + 1)%(nr_rows_A*nr_cols_A/(N_BIN));
		G[i].y = (i + 1)%(nr_rows_A*nr_cols_A/(N_BIN));
	}
	cudaMemcpy(A, G, nr_rows_A * nr_cols_A * sizeof(cuComplex), cudaMemcpyHostToDevice);
	delete[] G;
}
// Same as GPU_fill, but the test pattern starts at zero instead of one.
void GPU_fill2(cuComplex *A, int nr_rows_A, int nr_cols_A) {
	cuComplex *G = new cuComplex[nr_rows_A*nr_cols_A];
	for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
		G[i].x = i%(nr_rows_A*nr_cols_A/(N_BIN));
		G[i].y = i%(nr_rows_A*nr_cols_A/(N_BIN));
	}
	cudaMemcpy(A, G, nr_rows_A * nr_cols_A * sizeof(cuComplex), cudaMemcpyHostToDevice);
	delete[] G;
}
void print_matrix(const cuComplex *A, int nr_rows_A, int nr_cols_A, int nr_sheets_A) {
	for(int i = 0; i < nr_rows_A; ++i){
		for(int j = 0; j < nr_cols_A; ++j){
			for(int k = 0; k < nr_sheets_A; ++k){
				// cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" << " ";
				printf("%i,%i,%i: %e + %e i\n", i, j, k, A[k*nr_rows_A*nr_cols_A + j * nr_rows_A + i].x, A[k*nr_rows_A*nr_cols_A + j * nr_rows_A + i].y);
			}
		}
		// cout << endl;
	}
	// cout << endl;
	// for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
	//	printf("%i,: %e + %e i\n", i, A[i].x, A[i].y);
	// }
}
void print_matrix2(const float *A, int nr_rows_A, int nr_cols_A) {
	// for(int j = 0; j < nr_cols_A; ++j){
	//	for(int i = 0; i < nr_rows_A; ++i){
	//		printf("%i,%i: %e\n", i, j, A[j * nr_rows_A + i]);
	//	}
	//	cout << endl;
	// }
	// cout << endl;
	for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
		printf("%i,: %e\n", i, A[i]);
	}
}
static cuComplex * d_weights = NULL;

void update_weights(char * filename){
	char weight_filename[128];
	strcpy(weight_filename, filename);
	FILE * weights;
	float * bf_weights;
	float complex * weights_dc;
	float complex * weights_dc_n;

	// Allocate heap memory for file data
	bf_weights = (float *)malloc(2*N_WEIGHTS*sizeof(float));
	weights_dc = (float complex *)malloc(N_WEIGHTS*sizeof(float complex));
	weights_dc_n = (float complex *)malloc(N_WEIGHTS*sizeof(float complex));

	weights = fopen(weight_filename, "r");
	int j;
	if (weights != NULL) {
		fread(bf_weights, sizeof(float), 2*N_WEIGHTS, weights);

		// Convert to complex numbers (and conjugate at the same time)
		for(j = 0; j < N_WEIGHTS; j++){
			weights_dc_n[j] = bf_weights[2*j] - bf_weights[(2*j)+1]*I;
		}

		// Transpose the weights
		int m, n;
		float complex transpose[N_BEAM][N_ELE*N_BIN];
		for(m = 0; m < N_BEAM; m++){
			for(n = 0; n < N_ELE*N_BIN; n++){
				transpose[m][n] = weights_dc_n[m*N_ELE*N_BIN + n];
			}
		}
		for(n = 0; n < N_ELE*N_BIN; n++){
			for(m = 0; m < N_BEAM; m++){
				weights_dc[n*N_BEAM + m] = transpose[m][n];
			}
		}
		fclose(weights);
	}
	free(bf_weights);

	// Copy weights to device
	cudaMemcpy(d_weights, weights_dc, N_WEIGHTS*sizeof(cuComplex), cudaMemcpyHostToDevice);
	free(weights_dc);
	free(weights_dc_n);
}
static cuComplex **d_arr_A = NULL;
static cuComplex **d_arr_B = NULL;
static cuComplex **d_arr_C = NULL;
static cuComplex * d_beamformed = NULL;
static cuComplex * d_data = NULL;
static cuComplex * d_data1 = NULL;
static float * d_outputs;
void init_beamformer(){
	// Allocate device memory for the weights, data, beamformer output, and STI output.
	cudaMalloc((void **)&d_weights, N_WEIGHTS*sizeof(cuComplex));
	cudaMalloc((void **)&d_data1, N_SAMP*sizeof(cuComplex));
	cudaMalloc((void **)&d_data, N_SAMP*sizeof(cuComplex));
	cudaError_t err_malloc = cudaMalloc((void **)&d_beamformed, N_TBF*sizeof(cuComplex));
	if (err_malloc != cudaSuccess) {
		printf("CUDA Error (cudaMalloc d_beamformed): %s\n", cudaGetErrorString(err_malloc));
	}
	cudaMalloc((void **)&d_outputs, N_POL*(N_OUTPUTS*sizeof(float)/2));

	// Set up the arrays of batch pointers used by cublasCgemmBatched.
	cudaError_t cudaStat;
	int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
	nr_rows_A = N_BEAM;
	nr_cols_A = N_ELE;
	nr_rows_B = N_ELE;
	nr_cols_B = N_TIME;
	nr_rows_C = N_BEAM;
	nr_cols_C = N_TIME;

	// Allocate the host-side pointer arrays.
	const cuComplex **h_arr_A = 0;
	const cuComplex **h_arr_B = 0;
	cuComplex **h_arr_C = 0;
	h_arr_A = (const cuComplex **)malloc(nr_rows_A * nr_cols_A * N_BIN * sizeof(const cuComplex*));
	h_arr_B = (const cuComplex **)malloc(nr_rows_B * nr_cols_B * N_BIN * sizeof(const cuComplex*));
	h_arr_C = (cuComplex **)malloc(nr_rows_C * nr_cols_C * N_BIN * sizeof(cuComplex*));

	// Point each batch at its slice of the contiguous device buffers.
	// Note: only the first N_BIN entries of each pointer array are used (batchCount = N_BIN).
	for(int i = 0; i < N_BIN; i++){
		h_arr_A[i] = d_weights + i*nr_rows_A*nr_cols_A;
		h_arr_B[i] = d_data + i*nr_rows_B*nr_cols_B;
		h_arr_C[i] = d_beamformed + i*nr_rows_C*nr_cols_C;
	}

	// Allocate the device-side pointer arrays.
	cudaStat = cudaMalloc((void **)&d_arr_A, nr_rows_A * nr_cols_A * N_BIN * sizeof(cuComplex*));
	assert(!cudaStat);
	cudaStat = cudaMalloc((void **)&d_arr_B, nr_rows_B * nr_cols_B * N_BIN * sizeof(cuComplex*));
	assert(!cudaStat);
	cudaStat = cudaMalloc((void **)&d_arr_C, nr_rows_C * nr_cols_C * N_BIN * sizeof(cuComplex*));
	assert(!cudaStat);

	// Copy the pointer arrays from host to device.
	cudaStat = cudaMemcpy(d_arr_A, h_arr_A, nr_rows_A * nr_cols_A * N_BIN * sizeof(cuComplex*), cudaMemcpyHostToDevice);
	assert(!cudaStat);
	cudaStat = cudaMemcpy(d_arr_B, h_arr_B, nr_rows_B * nr_cols_B * N_BIN * sizeof(cuComplex*), cudaMemcpyHostToDevice);
	assert(!cudaStat);
	cudaStat = cudaMemcpy(d_arr_C, h_arr_C, nr_rows_C * nr_cols_C * N_BIN * sizeof(cuComplex*), cudaMemcpyHostToDevice);
	assert(!cudaStat);

	// The host-side pointer arrays are no longer needed once copied to the device.
	free(h_arr_A);
	free(h_arr_B);
	free(h_arr_C);
}
__global__
void data_restructure(cuComplex * data, cuComplex * data_restruc){
	int e = threadIdx.x;
	int t = blockIdx.x;
	int f = blockIdx.y;

	// Restructure the data so that the frequency bin is the slowest-moving index.
	data_restruc[f*N_TIME*N_ELE + t*N_ELE + e] = data[t*N_BIN*N_ELE + f*N_ELE + e];
}
void data_in(char * input_filename){
	FILE * data;

	// File data pointer
	float * bf_data;

	// Complex data pointer
	float complex * data_dc;

	// Allocate heap memory for file data
	bf_data = (float *)malloc(2*N_SAMP*sizeof(float));
	data_dc = (float complex *)malloc(N_SAMP*sizeof(float complex));

	// Open the input file
	data = fopen(input_filename, "r");

	/*********************************************************
	 * Read in Data
	 *********************************************************/
	if (data != NULL) {
		fread(bf_data, sizeof(float), 2*N_SAMP, data);

		int j;
		// Make 'em complex!
		for (j = 0; j < N_SAMP; j++) {
			data_dc[j] = bf_data[2*j] + bf_data[(2*j)+1]*I;
		}

		// Specify grid and block dimensions
		dim3 dimBlock_d(N_ELE, 1, 1);
		dim3 dimGrid_d(N_TIME, N_BIN, 1);

		cuComplex * d_data_in = d_data1;
		cuComplex * d_data_out = d_data;
		cudaMemcpy(d_data_in, data_dc, N_SAMP*sizeof(cuComplex), cudaMemcpyHostToDevice);

		// Restructure the data for the cublasCgemmBatched function.
		data_restructure<<<dimGrid_d, dimBlock_d>>>(d_data_in, d_data_out);

		fclose(data);
	}
	free(bf_data);
	free(data_dc);
}
void beamform(cublasHandle_t handle) {
	int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C;
	nr_rows_A = N_BEAM;
	nr_cols_A = N_ELE;
	nr_rows_B = N_ELE;
	nr_cols_B = N_TIME;
	nr_rows_C = N_BEAM;

	// Leading dimensions are the row counts of each matrix since the data is stored in column-major order.
	int lda = nr_rows_A, ldb = nr_rows_B, ldc = nr_rows_C;

	cuComplex alf;
	cuComplex bet;
	alf.x = 1;
	alf.y = 0;
	bet.x = 0;
	bet.y = 0;

	int batchCount = N_BIN; // There must be the same number of batches in each array.
	cublasStatus_t stat;

	/*
	   This call performs a batched matrix multiplication of the data and the weights:
	   weights - d_arr_A, data - d_arr_B, output - d_arr_C.
	*/
	stat = cublasCgemmBatched(
			handle,       // Handle to the cuBLAS library context.
			CUBLAS_OP_N,  // Operation on the matrices within array A.
			CUBLAS_OP_N,  // Operation on the matrices within array B.
			nr_rows_A,    // Number of rows in matrices A and C.
			nr_cols_B,    // Number of columns in matrices B and C.
			nr_cols_A,    // Number of columns of A and rows of B.
			&alf,         // Scalar used for multiplication.
			(const cuComplex **)d_arr_A, // Weight array of pointers.
			lda,          // Leading dimension of each matrix in array A.
			(const cuComplex **)d_arr_B, // Data array of pointers.
			ldb,          // Leading dimension of each matrix in array B.
			&bet,         // Scalar used for multiplication.
			(cuComplex **)d_arr_C,       // Output array of pointers.
			ldc,          // Leading dimension of each matrix in array C.
			batchCount);  // Number of batches in each array.

	if (stat != CUBLAS_STATUS_SUCCESS) {
		cerr << "cublasCgemmBatched failed" << endl;
		exit(1);
	}
	assert(!cudaGetLastError());

	// Free GPU memory
	// cudaFree(d_A);
	// cudaFree(d_B);
	// cudaFree(d_C);

	// Destroy the handle
	// cublasDestroy(handle);
}
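/*
 * Sketch of what the batched call above computes, for reference only (dimensions
 * taken from the parameters set in this function; nothing here is executed):
 *
 *   for each frequency bin f = 0 .. N_BIN-1 (batchCount):
 *       C_f = alf * A_f * B_f + bet * C_f
 *
 *   A_f : N_BEAM x N_ELE   column-major weight matrix  (lda = N_BEAM)
 *   B_f : N_ELE  x N_TIME  column-major data matrix    (ldb = N_ELE)
 *   C_f : N_BEAM x N_TIME  column-major beam output    (ldc = N_BEAM)
 *
 * With alf = 1 and bet = 0 this reduces to a plain per-bin multiplication of the
 * weights with the restructured data.
 */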
__global__
void sti_reduction(cuComplex * data_in, float * data_out) {
	int f = blockIdx.x;
	int b = blockIdx.y;
	int t = threadIdx.x;
	int s = blockIdx.z;

	int h = sample_idx(s*N_TIME_STI + t, b, f);            // Index into the beamformer output (first set of beams); see the sample_idx macro in the header file.
	int h1 = sample_idx(s*N_TIME_STI + t, b + N_BEAM1, f); // Index into the beamformer output (last set of beams).

	// Temporary variables used for updating.
	float beam_power1;
	float beam_power2;
	float cross_power1;
	float cross_power2;
	cuFloatComplex samp1;
	cuFloatComplex samp2;
	float scale = 1.0/N_TIME_STI; // Scale power by the number of samples per STI window.

	__shared__ cuFloatComplex reduced_array1[N_STI_BLOC];
	__shared__ cuFloatComplex reduced_array[N_STI_BLOC];

	if (t < N_TIME_STI) {
		// X polarization (XX*).
		samp1.x = data_in[h].x;
		samp1.y = data_in[h].y;
		beam_power1 = (samp1.x * samp1.x) + (samp1.y * samp1.y); // Beamformer output multiplied by its conjugate (absolute value squared).
		reduced_array[t].x = beam_power1;

		// Y polarization (YY*).
		samp2.x = data_in[h1].x;
		samp2.y = data_in[h1].y;
		beam_power2 = (samp2.x * samp2.x) + (samp2.y * samp2.y); // Beamformer output multiplied by its conjugate (absolute value squared).
		reduced_array[t].y = beam_power2;

		// Cross polarization (XY*).
		cross_power1 = (samp1.x * samp2.x) + (samp1.y * samp2.y); // Real part of the cross polarization.
		cross_power2 = (samp1.y * samp2.x) - (samp1.x * samp2.y); // Imaginary part of the cross polarization.
		reduced_array1[t].x = cross_power1;
		reduced_array1[t].y = cross_power2;
	}
	else {
		reduced_array[t].x = 0.0;
		reduced_array[t].y = 0.0;
		reduced_array1[t].x = 0.0;
		reduced_array1[t].y = 0.0;
	}
	__syncthreads();

	// Reduction: repeatedly fold the upper half of the block onto the lower half.
	// The number of threads per block must be a power of two so that no elements are left over.
	for (int k = blockDim.x/2; k > 0; k >>= 1) {
		if (t < k) {
			reduced_array[t].x += reduced_array[t+k].x;
			reduced_array[t].y += reduced_array[t+k].y;
			reduced_array1[t].x += reduced_array1[t+k].x;
			reduced_array1[t].y += reduced_array1[t+k].y;
		}
		__syncthreads();
	}

	// After the reduction is complete, write each reduced value to its position in the output array.
	if (t == 0) {
		data_out[output_idx(0,b,s,f)] = reduced_array[0].x*scale;  // XX*.
		data_out[output_idx(1,b,s,f)] = reduced_array[0].y*scale;  // YY*.
		data_out[output_idx(2,b,s,f)] = reduced_array1[0].x*scale; // XY* real.
		data_out[output_idx(3,b,s,f)] = reduced_array1[0].y*scale; // XY* imaginary.
	}
}
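/*
 * Launch mapping for the kernel above (as configured in run_beamformer() below):
 * gridDim = (N_BIN, N_BEAM1, N_STI) and blockDim = N_STI_BLOC, so each block
 * reduces one STI window, in one frequency bin, of the two beamformer outputs
 * indexed b and b + N_BEAM1 (treated as the X and Y polarizations), producing
 * the four products XX*, YY*, Re(XY*), and Im(XY*).
 */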
void run_beamformer(cublasHandle_t handle, float * data_out){
	// Specify grid and block dimensions
	dim3 dimBlock(N_STI_BLOC, 1, 1);
	dim3 dimGrid(N_BIN, N_BEAM1, N_STI);

	printf("Starting beamformer\n");

	// Call the beamformer function containing cublasCgemmBatched().
	beamform(handle);
	cudaError_t err_code = cudaGetLastError();
	if (err_code != cudaSuccess) {
		printf("CUDA Error (beamform): %s\n", cudaGetErrorString(err_code));
	}

	cuComplex * d_data_in = d_beamformed;
	float * d_data_out = d_outputs;

	printf("Starting sti_reduction\n");

	// Call the STI reduction kernel.
	sti_reduction<<<dimGrid, dimBlock>>>(d_data_in, d_data_out);

	printf("Finishing sti_reduction\n");
	err_code = cudaGetLastError();
	if (err_code != cudaSuccess) {
		printf("CUDA Error (sti_reduction): %s\n", cudaGetErrorString(err_code));
	}

	// Copy output data from device to host.
	cudaMemcpy(data_out, d_data_out, N_POL*(N_OUTPUTS*sizeof(float)/2), cudaMemcpyDeviceToHost);

	cudaFree(d_data);
	cudaFree(d_outputs);
}
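
/*
 * A minimal usage sketch, for illustration only. It assumes the macros
 * (N_WEIGHTS, N_SAMP, N_OUTPUTS, N_POL, etc.) come from cublas_beamformer.h and
 * that "weights.bin" and "input.bin" are hypothetical names for binary files of
 * interleaved 32-bit floats (real, imag). It is compiled only when
 * BEAMFORMER_STANDALONE_DEMO is defined, so it does not affect normal builds.
 */
#ifdef BEAMFORMER_STANDALONE_DEMO
int main(){
	// Allocate device buffers and the batched-GEMM pointer arrays.
	init_beamformer();

	// Load the beamforming weights and the input data onto the device.
	char weight_file[] = "weights.bin";
	char input_file[]  = "input.bin";
	update_weights(weight_file);
	data_in(input_file);

	// Create the cuBLAS context used by beamform().
	cublasHandle_t handle;
	cublasCreate(&handle);

	// Host buffer sized to match the device-to-host copy in run_beamformer().
	float * output = (float *)malloc(N_POL*(N_OUTPUTS*sizeof(float)/2));

	// Run the batched beamformer and the STI power reduction.
	run_beamformer(handle, output);

	// Print the reduced output.
	print_matrix2(output, N_POL*N_OUTPUTS/2, 1);

	cublasDestroy(handle);
	free(output);
	return 0;
}
#endif // BEAMFORMER_STANDALONE_DEMO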