-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathThermQWalk.cuh
executable file
·213 lines (158 loc) · 8 KB
/
ThermQWalk.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#define THERMQWALK
#ifdef _MSC_VER
#define STRINGIZE_HELPER(x) #x
#define STRINGIZE(x) STRINGIZE_HELPER(x)
#define __MESSAGE(text) __pragma( message(__FILE__ "(" STRINGIZE(__LINE__) ")" text) )
#define WARNING(text) __MESSAGE( " : Warning: " #text )
#define ERROR(text) __MESSAGE( " : Error: " #text )
#define MESSAGE(text) __MESSAGE( ": " #text )
#define TODO(text) WARNING( TODO: text )
#endif
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <math_constants.h>
#include <curand_kernel.h>
#include <cuda_profiler_api.h>
#include "helper.h"
#include "input_parameters.cuh"
// real and complex types are used when high floating point precision might be required (depends on the choice of parameters)
// Set to float (cuFloatComplex) or double (cuDoubleComplex) to represent real and complex numbers
typedef double real;
typedef cuFloatComplex complex;
#define PI_DOUBLE 3.141592653589793
#define N_MODES (N_SITES-1) // number of spring / modes
#define N_SITES2 (N_SITES)*(N_SITES)
#define N_MODES2 (N_MODES)*(N_MODES)
#define HBAR 5.194037869154565e-29
#define BOLTZMANN 7.942974009171879e-32
#define FIRST_TARGET_NODE ((int) (N * (1.0-PERC_TARGET)))
#define SIM_DT (double(SIM_LENGTH)/(SIM_POINTS-1))
// Device dependent settings
#define ARCH_MAX_THREADS 1024
#define CHUNK_R 7680
#define R_CHUNKS ((R + CHUNK_R - 1) / CHUNK_R)
#define CHUNK_SIZE_K 1
#define CHUNK_SIZE_M 1
#define CHUNK_SIZE_T 21
#define CHUNK_SIZE_GAMMA 3
// Simulation settings
//#define SIM_VERBOSE // Define when want to output the deviation from unity of the sq. mod.
// of the numerical estimate of |psi(t+dt)>. WARNING: very high glob. mem. usage
/* Macros */
#define CUDASYNC() handleCudaError(cudaDeviceSynchronize())
/* Data structures */
struct DATAPOINT {
float k,T,gamma;
real m;
float perf_idx;
real time_idx;
bool operator< (const DATAPOINT& other) const {
if(gamma < other.gamma )
return true;
else if(other.gamma == this->gamma && T < other.T )
return true;
else if(other.gamma == this->gamma && other.T == this->T && m < other.m )
return true;
else if(other.gamma == this->gamma && other.T == this->T && other.m == this->m && k < other.k )
return true;
return false;
}
};
/* Complex numbers auxiliary functions */
__host__ __device__ static __inline__ cuFloatComplex cuCadd(cuFloatComplex x, cuFloatComplex y) {
return cuCaddf(x, y);
}
__host__ __device__ static __inline__ cuFloatComplex cuCmul(cuFloatComplex x, cuFloatComplex y) {
return cuCmulf(x, y);
}
// This function sets a float complex number given its real and imaginary parts as floats
template<typename T_A, typename T_B>
__host__ __device__ static __inline__ T_A cuCset(const T_B r, const T_B i) {
T_A c;
c.x = r;
c.y = i;
return c;
}
// This function scales a T_A complex number by a T_B type real number
template<typename T_A, typename T_B>
__host__ __device__ static __inline__ T_A cuCscale(const T_A a, const T_B b) {
T_A c;
c.x = a.x*b;
c.y = a.y*b;
return c;
}
// Constants
// Return real unit
__device__ __host__ static __inline__ cuComplex realOne() { return cuCset<cuComplex, float>(1.0f, 0.0f); }
// Differential costant: -i/hbar (hbar = 1 for our purposes)
__device__ __host__ static __inline__ cuComplex diffC() { return cuCset<cuComplex, float>(0.0f, -1.0f); }
// Device functions
template<int N>
__device__ constexpr real eigenvecNormFactor();
template<typename T, int stride, int size>
__device__ __forceinline__ void ZeroStridedVectors(const int glob_idx, T * const d_ptr);
template<int stride, int size>
__device__ __forceinline__ void ZeroStridedVectors(const int glob_idx, float2 * const d_ptr);
template<int stride, int size>
__device__ __forceinline__ void ZeroStridedVectors(const int glob_idx, double2 * const d_ptr);
template<typename T_A, typename T_B, int stride, int size>
__device__ __forceinline__ void AddScaledCVecToCVec(const int glob_idx, T_A * const a, const T_B b, T_A * const c,T_A * const out);
template<typename T_IN,typename T_OUT>
__device__ __forceinline__ T_OUT cuCSqrdAbs(const T_IN c);
template<typename T_IN,typename T_OUT, int stride, int size>
__device__ __forceinline__ T_OUT CVecSqrdAbs(const int glob_idx, T_IN * const vec);
template<typename T_IN,typename T_OUT, int stride, int size>
__device__ __forceinline__ T_OUT CVecNorm(const int glob_idx, T_IN * const vec);
template<typename T_C, typename T_R, int stride, int size>
__device__ __forceinline__ void CVecScale(const int glob_idx, T_C * const vec,const T_R scale, T_C * const out);
template<typename T, int stride, int size>
__device__ __forceinline__ void VecCopy(const int glob_idx, T * const in,T * const out);
template<int block_dim_x, int block_dim_y, int num_tot,int stride,int n, int N>
__device__ __forceinline__ void evalKappa(const int2 i, const int glob_idx, const real t, float * const d_A,real * const d_omega, float * const d_phi,const float gamma, real * const d_orthU, float * const d_osc, float * const d_out);
template<int block_dim_x, int block_dim_y,int num_tot,int stride,int n, int N>
__device__ __forceinline__ void diff(const int2 i, const int glob_idx, const real t, cuComplex * const d_psi,
float * const d_diag, float * const d_A,
real * const d_omega, float * const d_phi, const float gamma, real * const d_orthU,
float * const d_kappa, float * const d_osc, cuComplex * const d_out);
template<int block_dim_x, int block_dim_y,int num_tot,int stride, int n, int N>
__device__ __forceinline__ void rk4(const int2 i, const int glob_idx, const real t, const real dt,
cuComplex * const d_psi, float * const d_diag, float * const d_A,
real * const d_omega, float * const d_phi, const float gamma, real * const d_orthU,
float * const d_kappa, float * const d_osc, cuComplex * const d_K[4], cuComplex * const d_out);
template<int stride,int n, int N>
__device__ __forceinline__ float stateStdDev(const int glob_idx,
cuComplex * const d_psi);
template<int stride,int n, int N>
__device__ __forceinline__ float targetProbability(const int glob_idx,
cuComplex * const d_psi);
template<int num_k, int num_m, int num_T, int num_gamma, int num_tot>
__global__ void FindMaxPerfIdx(float * const k_array, real * const m_array,
float * const T_array, float * const gamma_array,
float * const d_avg_perf_idx, DATAPOINT * const out);
template<int num_tot,int threadsPerBlock>
__global__ void AveragePerfIdx(float * const d_perf_idx, float * const d_out);
template<int block_dim_x, int block_dim_y, int stride_x, int stride_y, int num_k, int num_m, int num_T, int num_gamma, int num_tot,int n, int N>
__global__ void RunSim(float * const k_array, real * const m_array,
float * const T_array, float * const gamma_array,
cuComplex * const d_psi, float * const d_diag, float * const d_A,
real * const d_omega, float * const d_phi, real * const d_orthU,
float * const d_kappa, float * const d_osc, cuComplex * const d_K[4], cuComplex * const d_psi_next,
float * const d_perf_idx /* vector of num_tot * CHUNK_R * SIM_POINTS performance indexes */,
float * const d_deviations /* (optional) deviation from 1 of norm of est. |psi(t+dt)> */);
template<int N>
__global__ void SampleDisorderVec(const float disorder_param, float * const d_diag, curandState * const states);
template<int stride_x, int stride_y, int num_k, int num_m, int num_T, int num_gamma, int num_tot, int n, int N>
__global__ void SampleAmplitudesAndPhases(float * const k_array, real * const m_array,
float * const T_array,
float * const d_A, real * const d_omega, float * d_phi,
curandState * const states);
__global__ void InitRNG(curandState *states, int seed, int num_gen);
template<int n, int N, int n2>
__global__ void StoreOrthU(real * const d_ptr);
template<typename T>
__global__ void VecInit(T * const d_ptr, const T val, const int D);