codeGen_convRunner.sh
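#!/bin/bash
# Code generator: writes a C++/CUDA test harness to stdout.
# The emitted file defines testConvolution(), which runs every memory scheme
# on Lena.pgm and saves a result image per configuration, and
# testConvolution_withDummyImg(), which times each configuration over nRuns
# iterations and logs the averages to perf.txt.

# Emit the includes for the generated file.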
echo '#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <opencv2/opencv.hpp>
#include "convolution.h"
#include "helpers.h"
using namespace std;
using namespace cv;'
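# Emit testConvolution(): load Lena.pgm and convert it to single-channel float.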
echo 'void testConvolution()
{
cv::Mat img = getRawImage("./Lena.pgm");
img.convertTo(img, CV_32FC1);'
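# global_only scheme: sweep kernel sizes 2..7 and outputs-per-thread 1..7;
# amountToLoad is the patch edge needed to cover sqrtConvsPerThread outputs.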
for memoryScheme in "global_only"
do
for ((kernelSize=2; kernelSize<8; kernelSize++)); do
for ((sqrtConvsPerThread=1; sqrtConvsPerThread<8; sqrtConvsPerThread++)); do
amountToLoad=$(($sqrtConvsPerThread+$kernelSize-1))
echo " convolutionWrapper((float*)&img.data[0], img.cols, img.rows, $amountToLoad, $kernelSize, \"$memoryScheme\", true, \"results/kernel${kernelSize}x${kernelSize}_size${amountToLoad}x${amountToLoad}_${memoryScheme}.png\");"
done
done
done
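# global_register scheme: amountToLoad sweeps from kernelSize up to 7
# (the patch is presumably prefetched into registers before convolving).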
for memoryScheme in "global_register" #"global_shared_register"
do
for ((kernelSize=2; kernelSize<8; kernelSize++)); do
for ((amountToLoad=$kernelSize; amountToLoad<8; amountToLoad++)); do
echo " convolutionWrapper((float*)&img.data[0], img.cols, img.rows, $amountToLoad, $kernelSize, \"$memoryScheme\", true, \"results/kernel${kernelSize}x${kernelSize}_size${amountToLoad}x${amountToLoad}_${memoryScheme}.png\");"
done
done
done
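# texCache_only scheme: reads go through the texture cache with no prefetch,
# so amountToLoad is fixed at kernelSize.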
for memoryScheme in "texCache_only"
do
for ((kernelSize=2; kernelSize<8; kernelSize++)); do
amountToLoad=$kernelSize
echo " convolutionWrapper_texCache((float*)&img.data[0], img.cols, img.rows, $amountToLoad, $kernelSize, \"$memoryScheme\", true, \"results/kernel${kernelSize}x${kernelSize}_size${amountToLoad}x${amountToLoad}_${memoryScheme}.png\");"
done
done
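# texCache_register scheme: texture-cache reads plus a register-sized patch,
# with amountToLoad again sweeping from kernelSize up to 7.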
for memoryScheme in "texCache_register"
do
for ((kernelSize=2; kernelSize<8; kernelSize++)); do
for ((amountToLoad=$kernelSize; amountToLoad<8; amountToLoad++)); do
echo " convolutionWrapper_texCache((float*)&img.data[0], img.cols, img.rows, $amountToLoad, $kernelSize, \"$memoryScheme\", true, \"results/kernel${kernelSize}x${kernelSize}_size${amountToLoad}x${amountToLoad}_${memoryScheme}.png\");"
done
done
done
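# Close testConvolution() and open testConvolution_withDummyImg(): timing runs
# on a synthetic image, averaged over nRuns and written to perf.txt, after one
# warmup convolution.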
echo '}
void testConvolution_withDummyImg(int height, int width)
{
float* img = getDummyImg(height, width);
FILE * pFile = fopen("perf.txt", "w");
fprintf(pFile, "kernelSize amountToLoad memoryScheme responseTime\n");
int nRuns = 10;
float responseTime = 0;
responseTime = convolutionWrapper(img, width, height, 3, 3, "global_register", false); //warmup'
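# Timed sweep for global_only, mirroring the correctness loops above but
# emitted as C++ for loops in the generated code.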
for memoryScheme in "global_only" #"global_register_BenchmarkComputeTime"
do
echo " printf(\"memoryScheme = %s \n\", \"$memoryScheme\");
for(int kernelSize=2; kernelSize<8; kernelSize++)
{
for(int sqrtConvsPerThread=1; sqrtConvsPerThread<8; sqrtConvsPerThread++)
{
int amountToLoad = sqrtConvsPerThread+kernelSize-1; //actually, prefetching nothing in this version
responseTime = 0;
for(int i=0; i<nRuns; i++)
{
float tmpTime = convolutionWrapper(img, width, height, amountToLoad, kernelSize, \"$memoryScheme\", false);
responseTime += tmpTime;
}
responseTime = responseTime/nRuns;
fprintf(pFile, \"%d, %d, %s, %f \\n\", kernelSize, amountToLoad, \"$memoryScheme\", responseTime);
printf(\"kernelSize = %d. amountToLoad = %d. time per Convolution = %f seconds \n\", kernelSize, amountToLoad, responseTime);
cudaDeviceSynchronize();
}
printf(\"\n\");
}"
done
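# Timed sweep for global_register.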
for memoryScheme in "global_register" #"global_shared_register"
do
echo " printf(\"memoryScheme = %s \n\", \"$memoryScheme\");
for(int kernelSize=2; kernelSize<8; kernelSize++)
{
for(int amountToLoad=kernelSize; amountToLoad<8; amountToLoad++)
{
responseTime = 0;
for(int i=0; i<nRuns; i++)
{
float tmpTime = convolutionWrapper(img, width, height, amountToLoad, kernelSize, \"$memoryScheme\", false);
responseTime += tmpTime;
}
responseTime = responseTime/nRuns;
fprintf(pFile, \"%d, %d, %s, %f \\n\", kernelSize, amountToLoad, \"$memoryScheme\", responseTime);
printf(\"kernelSize = %d. amountToLoad = %d. time per Convolution = %f seconds \n\", kernelSize, amountToLoad, responseTime);
cudaDeviceSynchronize();
}
printf(\"\n\");
}"
done
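# Timed sweep for texCache_only (amountToLoad fixed at kernelSize).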
for memoryScheme in "texCache_only"
do
echo " printf(\"memoryScheme = %s \n\", \"$memoryScheme\");
for(int kernelSize=2; kernelSize<8; kernelSize++)
{
int amountToLoad = kernelSize;
responseTime = 0;
for(int i=0; i<nRuns; i++)
{
float tmpTime = convolutionWrapper_texCache(img, width, height, amountToLoad, kernelSize, \"$memoryScheme\", false);
responseTime += tmpTime;
}
responseTime = responseTime/nRuns;
fprintf(pFile, \"%d, %d, %s, %f \\n\", kernelSize, amountToLoad, \"$memoryScheme\", responseTime);
printf(\"kernelSize = %d. amountToLoad = %d. time per Convolution = %f seconds \n\", kernelSize, amountToLoad, responseTime);
cudaDeviceSynchronize();
printf(\"\n\");
}"
done
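# Timed sweep for texCache_register; this block also closes perf.txt.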
for memoryScheme in "texCache_register" #"texCache_shared_register"
do
echo " printf(\"memoryScheme = %s \n\", \"$memoryScheme\");
for(int kernelSize=2; kernelSize<8; kernelSize++)
{
for(int amountToLoad=kernelSize; amountToLoad<8; amountToLoad++)
{
responseTime = 0;
for(int i=0; i<nRuns; i++)
{
float tmpTime = convolutionWrapper_texCache(img, width, height, amountToLoad, kernelSize, \"$memoryScheme\", false);
responseTime += tmpTime;
}
responseTime = responseTime/nRuns;
fprintf(pFile, \"%d, %d, %s, %f \\n\", kernelSize, amountToLoad, \"$memoryScheme\", responseTime);
printf(\"kernelSize = %d. amountToLoad = %d. time per Convolution = %f seconds \n\", kernelSize, amountToLoad, responseTime);
cudaDeviceSynchronize();
}
printf(\"\n\");
}
fclose(pFile);"
done
echo '}'