From 489c073d271acfe6449b8a5dfadf4e602a5b9421 Mon Sep 17 00:00:00 2001 From: Neil Kichler Date: Wed, 16 Oct 2024 17:48:45 +0200 Subject: [PATCH] further failed attempts to have shared memory work beyond 1 tangent at the same time --- examples/mccormick/mccormick_v.cu | 309 ++++++++++++++++++- include/cutangent/arithmetic/intrinsic_v.cuh | 62 +++- 2 files changed, 352 insertions(+), 19 deletions(-) diff --git a/examples/mccormick/mccormick_v.cu b/examples/mccormick/mccormick_v.cu index d553b5c..a2b181e 100644 --- a/examples/mccormick/mccormick_v.cu +++ b/examples/mccormick/mccormick_v.cu @@ -15,7 +15,7 @@ using cu::tangents; -#define PRINT_DEBUG 0 +#define PRINT_DEBUG 1 #define USE_VECTOR_LOAD_128 0 #define USE_VECTOR_LOAD_256 0 @@ -23,7 +23,7 @@ using cu::tangents; constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const auto &w) { auto print = [](auto &x) { -#if 1 || PRINT_DEBUG +#if 1 && PRINT_DEBUG printf("[gid:%3d][bid:%3d][tid:%3d] f {v {%g, %g, %g, %g}, cv [%g, %g, %g, %g] cc [%g, %g, %g, %g] lb [%g, %g, %g, %g] ub [%g, %g, %g, %g]}\n", threadIdx.x + blockIdx.x * blockDim.x, blockIdx.x, threadIdx.x, x.cv.v, @@ -43,7 +43,7 @@ constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const a // auto a = x + y; // cu::mccormick> a; - print(x); + // print(x); // print(y); auto a = x + y + z + w; // auto b = a + a + a + a; @@ -89,7 +89,8 @@ kernel(T *in, T *out, int n_elems, int n_vars) // TODO: unroll for (int i = tid + t_block_start; i < t_block_end; i += n_threads) { int v = i / n_doubles_per_mc; -#if PRINT_DEBUG +#if 0 && PRINT_DEBUG + printf("[gid:%3d][bid:%3d][tid:%3d][t_bstart:%d] i: %3d\n", gid, bid, tid, t_block_start, i); printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][xid:%3d][i:%3d] n_elems_tangent: %3d t_block_start: %3d t_block_end: %3d\n", gid, bid, tid, v, xid, i, n_elems_per_block_with_tangents, t_block_start, t_block_end); #endif @@ -99,14 +100,17 @@ kernel(T *in, T *out, int n_elems, int n_vars) bool is_cv_or_cc = i % n_doubles_per_mc < 2 * (N + 1); // 2 since we only seed cv and cc ((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0; -#if PRINT_DEBUG - // printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d] tangent seed value: %g\n", - // gid, bid, tid, v, tangent_idx, xid, i - t_block_start, ((double *)xs)[i - t_block_start]); +#if 0 && PRINT_DEBUG + printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d][t_bstart:%3d] tangent seed value: %g\n", + gid, bid, tid, v, tangent_idx, xid, i - t_block_start, t_block_start, ((double *)xs)[i - t_block_start]); #endif } __syncthreads(); +#if 1 + + // Load elements from global memory into shared memory trying to get a balanced allocation in all blocks #if USE_VECTOR_LOAD_128 for (int i = tid * 2 + block_start; i + 1 < block_end; i += n_threads) { @@ -190,12 +194,14 @@ kernel(T *in, T *out, int n_elems, int n_vars) for (int i = tid + out_block_start; i < out_block_end; i += n_threads) { int sid = out_sh_mem_offset + i - out_block_start; out[i] = ((double *)xs)[sid]; -#if 0 || PRINT_DEBUG +#if 0 && PRINT_DEBUG printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n", gid, bid, tid, out_block_start, out_block_end, sid, sid % 32, i, out[i]); #endif } +#endif + // // Other tangent iterations (if n_tangents < n_vars) // @@ -264,7 +270,7 @@ kernel(T *in, T *out, int n_elems, int n_vars) int out_idx = i + tangent_out_offset; out[out_idx] = ((double *)xs)[sid]; // TODO: skip value in copy (already have it) -#if 1 || PRINT_DEBUG +#if 0 && PRINT_DEBUG printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d][k:%d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n", gid, bid, tid, out_block_start, out_block_end, k, sid, sid % 32, out_idx, out[out_idx]); #endif @@ -288,9 +294,281 @@ kernel(T *in, T *out, int n_elems, int n_vars) we should first try to make use of all the blocks individually doing one mccormick computation (1a). */ +// struct range +// { +// int begin; +// int end; +// }; +// +// constexpr range value_range(int bid, int n_elems, int n_blocks, int n_vars) +// { +// int n = n_elems * n_vars; +// +// // block range when considering only mccormick values (for initial copy from global memory) +// // int n_elems_per_block = (n_elems + n_blocks - 1) >> int(log2(n_blocks)); +// int n_elems_per_block = (n_elems + n_blocks - 1) / n_blocks; +// int block_start = n_elems_per_block * bid * 4 * n_vars; +// int block_end = std::min(block_start + n_elems_per_block * 4 * n_vars, n * 4); +// +// return { block_start, block_end }; +// }; + + +#if 0 +__device__ void debug_sync() +{ + __syncthreads(); +} + +__device__ void real_sync() +{ + __syncthreads(); +} +extern __shared__ double xs[]; + +template +__shared__ tangents *tangents_block_buffer; + +template +__global__ void +kernel(T *in, T *out, int n_elems, int n_vars) +{ + cu::mccormick> *xs_mc = (cu::mccormick> *)xs; + + assert(N <= n_vars && "number of tangents (N) must be <= number of variables (n_vars)"); + // might want to assert that N should be power of 2, and ideally multiple of 32 + + const int n = n_elems * n_vars; // total number of mccormick variables across all elements + const int gid = threadIdx.x + blockIdx.x * blockDim.x; // global id + const int bid = blockIdx.x; // block id + const int tid = threadIdx.x; // thread id inside block + const int n_threads = blockDim.x; // number of threads in a block + const int n_blocks = gridDim.x; // number of blocks in the grid TODO: should probably be power of two for fast % operation + const int n_doubles_per_mc = 4 * (N + 1); // 4 for cv, cc, lb, ub + const int n_out_doubles_per_mc = 2 * (N + 1); // 2 for cv, cc + const int xid = gid / n_vars; // mccormick id in xs + + const int n_elems_per_block = (n_elems + n_blocks - 1) / n_blocks; + + if (tid == 0) { + printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] n_elems_per_block: %3d\n", + gid, bid, tid, xid, n_elems_per_block); + } + + const int block_start = n_elems_per_block * bid * 4 * n_vars; + const int block_end = min(block_start + n_elems_per_block * 4 * n_vars, n * 4); + + if (tid == 0) + printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] elems_per_block: %3d block_start: %4d block_end: %4d\n", + gid, bid, tid, xid, n_elems_per_block, block_start, block_end); + + // block range when considering tangents as well + const int n_elems_per_block_with_tangents = n_elems_per_block * n_vars * n_doubles_per_mc; + const int t_block_start = n_elems_per_block_with_tangents * bid; + const int t_block_end = min(t_block_start + n_elems_per_block_with_tangents, n_elems * n_vars * n_doubles_per_mc); + + if (tid == 0) + printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] elems_per_block_w_tan: %3d t_block_start: %4d t_block_end: %4d\n", + gid, bid, tid, xid, n_elems_per_block_with_tangents, t_block_start, t_block_end); + + debug_sync(); + + // All the configurations that need to be checked: + // + // 1. n_tangents == n_vars [x] + // 2. n_tangents < n_vars [x] + // 3. n_tangents > n_vars -> assert [x] + // + // - n_blocks < n_elems [x] + // - n_blocks == n_elems [x] + // - n_blocks > n_elems [x] + // + // - n_threads < n_vars [x] + // - n_threads == n_vars [x] + // - n_threads > n_vars [x] + int iter = 0; + for (int i = tid + t_block_start; i < t_block_end; i += n_threads) { + int v = i / n_doubles_per_mc; + int tangent_idx = (i % (N + 1)) - 1; // tangent index for this thread, -1 is no tangent but a value to be skipped + // faster alternative: int tangent_idx = x - floor(1/(N+1) * x) * (N+1) - 1; // TODO: check if compiler figures this out + bool is_cv_or_cc = i % n_doubles_per_mc < 2 * (N + 1); // 2 since we only seed cv and cc + ((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0; + + // if (bid == 0 && iter == 0) { + // if (bid == 0) { + // printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d][t_bstart:%3d][iter:%3d] tangent seed value: %g\n", + // gid, bid, tid, v, tangent_idx, xid, i - t_block_start, t_block_start, iter, ((double *)xs)[i - t_block_start]); + // } + + iter++; + } + + real_sync(); + + // All the configurations that need to be checked: + // + // 1. n_tangents == n_vars [x] + // 2. n_tangents < n_vars [x] + // 3. n_tangents > n_vars -> assert [x] + // + // - n_blocks < n_elems [x] + // - n_blocks == n_elems [x] + // - n_blocks > n_elems [x] + // + // - n_threads < n_vars [x] + // - n_threads == n_vars [x] + // - n_threads > n_vars [x] + + // Load elements from global memory into shared memory trying to get a balanced allocation in all blocks +#if USE_VECTOR_LOAD_128 + for (int i = tid * 2 + block_start; i + 1 < block_end; i += n_threads) { + int sid = (i - block_start) * (N + 1); + double2 tmp = *(double2 *)&in[i]; // init value + ((double *)xs)[sid] = tmp.x; + ((double *)xs)[sid + (N + 1)] = tmp.y; + // printf("[gid:%3d][bid:%3d][tid:%3d][i:%3d] init value: %g\n", gid, bid, tid, i, in[i]); + } +#elif USE_VECTOR_LOAD_256 + for (int i = tid * 4 + block_start; i + 3 < block_end; i += n_threads) { + int sid = (i - block_start) * (N + 1); + double4 tmp = *(double4 *)&in[i]; // init value + ((double *)xs)[sid] = tmp.x; + ((double *)xs)[sid + (N + 1)] = tmp.y; + ((double *)xs)[sid + 2 * (N + 1)] = tmp.z; + ((double *)xs)[sid + 3 * (N + 1)] = tmp.w; + // printf("[gid:%3d][bid:%3d][tid:%3d][i:%3d] init value: %g\n", gid, bid, tid, i, in[i]); + } +#else + for (int i = tid + block_start; i < block_end; i += n_threads) { + int sid = (i - block_start) * (N + 1); + ((double *)xs)[sid] = in[i]; // init value + // printf("[gid:%3d][bid:%3d][tid:%3d][sid:%4d][i:%3d] init value: %g\n", gid, bid, tid, sid, i, in[i]); + } +#endif + + real_sync(); + + // Actual computation + int compute_out_offset_idx = n_elems_per_block * n_vars; // index offset into shared memory + int compute_block_start = n_elems_per_block * bid * N; + int compute_block_end = min(compute_block_start + n_elems_per_block * N, n_elems * N); + + // tangents_block_buffer = &((tangents *)xs)[compute_out_offset_idx]; + // + // if (tid == 0) { + // printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] tangents_block_buffer: %p, el: %g\n", gid, bid, tid, xid, tangents_block_buffer, tangents_block_buffer[0].v); + // printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] xs: %p, xs[0]: %g\n", gid, bid, tid, xid, xs, xs[tid]); + // } + +#if 1 + iter = 0; + // TODO: unroll + // for (int i = tid + block_start; i < block_end; i += n_threads) { + for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) { + + int lid = xid * n_vars - bid * n_threads; // local variable id offset + int sid = (i - t_block_start) % N; // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N) + int rid = compute_out_offset_idx + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents + int vid = bid * n_elems_per_block + tid / N; + + if (bid == 0) { + printf("[gid:%3d][bid:%3d][tid:%3d][lid:%3d][sid:%3d][rid:%3d][vid:%3d][i:%3d][iter:%3d] inputs (cv.v): %g %g %g %g\n", + gid, bid, tid, lid, sid, rid, vid, i, iter, xs_mc[lid].cv.v, xs_mc[lid + 1].cv.v, xs_mc[lid + 2].cv.v, xs_mc[lid + 7].cv.v); + } + + // if (bid > 0) + // continue; + + // TODO: this can be replaced by a single pointer input to &xs[lid] + // in fact, here should reside a cuda device graph + auto res = f(xs_mc[lid], xs_mc[lid + 1], xs_mc[lid + 2], xs_mc[lid + 7]); + + iter++; + } + +#elif 0 + // + // if (tid == 0) + // printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] compute_out_offset_idx: %3d compute_block_start: %4d compute_block_end: %4d\n", + // gid, bid, tid, xid, compute_out_offset_idx, compute_block_start, compute_block_end); + + // All the configurations that need to be checked: + // + // 1. n_tangents == n_vars [ ] + // 2. n_tangents < n_vars [ ] + // 3. n_tangents > n_vars -> assert [x] + // + // - n_blocks < n_elems [ ] + // - n_blocks == n_elems [ ] + // - n_blocks > n_elems [ ] + // + // - n_threads < n_vars [ ] + // - n_threads == n_vars [ ] + // - n_threads > n_vars [ ] + + iter = 0; + // TODO: unroll + // for (int i = tid + block_start; i < block_end; i += n_threads) { + for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) { + + // TODO: lid is wrong + int lid = xid * n_vars - bid * n_threads; // local variable id + // int lid = bid * n_elems_per_block; // local variable id + int sid = (i - t_block_start) % N; // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N) + int rid = compute_out_offset_idx + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents + // int rid = compute_out_offset_idx + lid; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents + + // int vid = xid - bid * N; + // int vid = gid / n_threads + bid * n_elems_per_block; + + // int lid = bid * n_elems_per_block + (tid / N) * n_vars; // local variable id + // int sid = (i - compute_block_start); // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N) + int vid = bid * n_elems_per_block + tid / N; + + if (bid == 0) { + printf("[gid:%3d][bid:%3d][tid:%3d][lid:%3d][sid:%3d][rid:%3d][vid:%3d][i:%3d][iter:%3d] inputs (cv.v): %g %g %g %g\n", + gid, bid, tid, lid, sid, rid, vid, i, iter, xs[lid].cv.v, xs[lid + 1].cv.v, xs[lid + 2].cv.v, xs[lid + 7].cv.v); + } + + // if (bid > 0) + // continue; + + // TODO: this can be replaced by a single pointer input to &xs[lid] + auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 7]); + +#if 1 + xs[rid].cv.ds[sid] = res.cv.ds[sid]; + xs[rid].cc.ds[sid] = res.cc.ds[sid]; + xs[rid].box.lb.ds[sid] = res.box.lb.ds[sid]; + xs[rid].box.ub.ds[sid] = res.box.ub.ds[sid]; + + // TODO: if we let every thread do the update, it might be a broadcast operation into shared memory + // and thus the same speed without the if statement. + if (sid % N == 0) { + // put res.v into shared memory + xs[rid].cv.v = res.cv.v; + xs[rid].cc.v = res.cc.v; + xs[rid].box.lb.v = res.box.lb.v; + xs[rid].box.ub.v = res.box.ub.v; + } + if (bid == 0) + printf("A [gid:%3d][bid:%3d][tid:%3d][rid:%3d][vid:%3d][xid:%3d][lid:%3d][sid:%3d][iter:%3d] in.v: %g %g %g %g, res is: %g %g %g %g %g\n", + gid, bid, tid, rid, vid, xid, lid, sid, iter, + xs[lid].cv.v, xs[lid + 1].cv.v, xs[lid + 2].cv.v, xs[lid + 7].cv.v, + res.cv.v, + res.cv.ds[sid], res.cc.ds[sid], res.box.lb.ds[sid], res.box.ub.ds[sid]); +#endif + iter++; + } +#endif +} + +#endif + int main() { - constexpr int n_elems = 40; + // constexpr int n_elems = 31; + constexpr int n_elems = 8; constexpr int n_vars = 16; constexpr int n = n_elems * n_vars; constexpr int n_copy_doubles_per_mc = 2 * (n_vars + 1); // the number of doubles to copy from device back to host per mccormick relaxation. Skips box derivatives. Take cv, cc, lb, ub, cv.ds, cc.ds @@ -298,15 +576,18 @@ int main() constexpr int n_blocks = 10; constexpr int n_threads = 256; + // constexpr int n_threads = n_vars; + // constexpr int n_threads = 8; constexpr int n_tangents = 16; // the number of tangents to perform per mccormick relaxation, a multiple of 32 is ideal constexpr int n_elems_per_block = std::ceil(double(n_elems) / n_blocks); constexpr int n_vars_per_block = n_vars * n_elems_per_block; // the number of mccormick variables to access in shared memory per block - assert(n_mccormick >= n_vars && "n_mccormick must be >= n_vars"); static_assert(n_tangents <= n_vars, "n_tangents must be <= n_vars"); + static_assert(n_blocks >= n_elems, "n_blocks must be >= n_elems for now"); + cu::mccormick xs[n_elems * n_vars] {}; double res[n_elems * n_copy_doubles_per_mc] {}; @@ -318,7 +599,7 @@ int main() } } -#if PRINT_DEBUG +#if 0 && PRINT_DEBUG for (auto x : xs) { std::cout << "x is: " << x << std::endl; } @@ -327,7 +608,7 @@ int main() constexpr int n_bytes_shared_in = n_vars_per_block * 4 * sizeof(double) * (n_tangents + 1); constexpr int n_bytes_shared_out = n_elems_per_block * 4 * sizeof(double) * (n_tangents + 1); constexpr int n_bytes_shared = n_bytes_shared_in + n_bytes_shared_out; - printf("n_bytes_shared = %d B\n", n_bytes_shared); + printf("n_bytes_shared = %g KiB\n", n_bytes_shared / 1024.0); double *d_xs; // we only use a single double array for easier coalescing double *d_res; // same as above @@ -343,7 +624,7 @@ int main() CUDA_CHECK(cudaMemcpy(res, d_res, (n_elems * n_copy_doubles_per_mc) * sizeof(*d_res), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaDeviceSynchronize()); -#if 1 || PRINT_DEBUG +#if 1 && PRINT_DEBUG for (auto r : res) { std::cout << r << std::endl; } diff --git a/include/cutangent/arithmetic/intrinsic_v.cuh b/include/cutangent/arithmetic/intrinsic_v.cuh index 03dae30..853e123 100644 --- a/include/cutangent/arithmetic/intrinsic_v.cuh +++ b/include/cutangent/arithmetic/intrinsic_v.cuh @@ -48,18 +48,51 @@ using cu::tangents; // // template<> inline __device__ tangent fma_down (tangent x, tangent y, tangent z) { return x * y + z; } // template<> inline __device__ tangent fma_up (tangent x, tangent y, tangent z) { return x * y + z; } + + +#if 0 +extern __shared__ double xs[]; + +template +__shared__ tangents *tangents_block_buffer; + template inline __device__ tangents add_down(const tangents &x, const tangents &y) { - __shared__ tangents res; - res.v = add_down(x.v, y.v); + // __shared__ tangents res[4]; + + printf("intrinsic tangents_block_buffer: %p, el: %g\n", + tangents_block_buffer, tangents_block_buffer[0].v); + + // if (tid == 0) { + // printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] xs: %p, xs[0]: %g\n", gid, bid, tid, xid, xs, xs[tid]); + // } + + // tangents res = (tangents *)xs; // multiple mccormick relaxations in one block int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid % N; i < N; i += blockDim.x * gridDim.x) { - // printf("[gid: %3d][bid: %3d][tid: %3d] i: %3d\n", gid, blockIdx.x, threadIdx.x, i); - res.ds[i] = add_down(x.ds[i], y.ds[i]); + + int rid = gid / N; + + #if 0 + // if (gid < N) { + if (gid > N and gid < 2 * N) { + + res[rid].v = add_down(x.v, y.v); + + for (int i = gid % N; i < N; i += blockDim.x * gridDim.x) { + // if (threadIdx.x == 0) { + printf("[gid: %4d][bid: %3d][tid: %3d][N:%3d] x.cv.v %g, y.cv.v %g, res.cv.v: %g\n", + gid, blockIdx.x, threadIdx.x, N, x.v, y.v, res.v); + // } + + + res[rid].ds[i] = add_down(x.ds[i], y.ds[i]); + } + } + #endif // if (threadIdx.x == 0) { // res.v = add_down(x.v, y.v); @@ -78,9 +111,28 @@ inline __device__ tangents add_down(const tangents &x, con // for (int i = 0; i < N; i++) { // res.ds[i] = add_down(x.ds[i], y.ds[i]); // } + return {}; + // return res; +} +#else + +template +inline __device__ tangents add_down(const tangents &a, const tangents &b) +{ + __shared__ tangents res; + res.v = add_down(a.v, b.v); + + // multiple mccormick relaxations in one block + int gid = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = gid % N; i < N; i += blockDim.x * gridDim.x) { + res.ds[i] = add_down(a.ds[i], b.ds[i]); + } return res; } + +#endif + template inline __device__ tangents add_up(const tangents &a, const tangents &b) {