Skip to content

Commit

Permalink
current state
Browse files Browse the repository at this point in the history
  • Loading branch information
neilkichler committed Oct 14, 2024
1 parent 8b08806 commit ac300be
Showing 1 changed file with 114 additions and 21 deletions.
135 changes: 114 additions & 21 deletions examples/mccormick/mccormick_v.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using cu::tangents;
constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const auto &w)
{
auto print = [](auto &x) {
#if PRINT_DEBUG
#if 1 || PRINT_DEBUG
printf("[gid:%3d][bid:%3d][tid:%3d] f {v {%g, %g, %g, %g}, cv [%g, %g, %g, %g] cc [%g, %g, %g, %g] lb [%g, %g, %g, %g] ub [%g, %g, %g, %g]}\n",
threadIdx.x + blockIdx.x * blockDim.x, blockIdx.x, threadIdx.x,
x.cv.v,
Expand All @@ -44,19 +44,18 @@ constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const a
// cu::mccormick<tangents<double, N>> a;

print(x);
print(y);
// print(y);
auto a = x + y + z + w;
auto b = a + a + a + a;
auto c = b + b + b;
auto d = c + c;
// auto b = a + a + a + a;
// auto c = b + b + b;
// auto d = c + c;
// print(a);

return d;
return a;
}

template<typename T, int N>
__global__ void
// __launch_bounds__(256, 1)
kernel(T *in, T *out, int n_elems, int n_vars)
{
extern __shared__ cu::mccormick<cu::tangents<T, N>> xs[];
Expand All @@ -82,9 +81,9 @@ kernel(T *in, T *out, int n_elems, int n_vars)
int t_block_start = n_elems_per_block_with_tangents * bid;
int t_block_end = min(t_block_start + n_elems_per_block_with_tangents, n_elems * n_vars * n_doubles_per_mc);

// if (tid == 0)
// printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] elems_per_block: %3d block_start: %3d block_end: %3d\n",
// gid, bid, tid, xid, n_elems_per_block, block_start, block_end);
if (tid == 0)
printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] elems_per_block: %3d block_start: %3d block_end: %3d\n",
gid, bid, tid, xid, n_elems_per_block, block_start, block_end);

// seed tangents
// TODO: unroll
Expand All @@ -101,8 +100,8 @@ kernel(T *in, T *out, int n_elems, int n_vars)
((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0;

#if PRINT_DEBUG
printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d] tangent seed value: %g\n",
gid, bid, tid, v, tangent_idx, xid, i - t_block_start, ((double *)xs)[i - t_block_start]);
// printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d] tangent seed value: %g\n",
// gid, bid, tid, v, tangent_idx, xid, i - t_block_start, ((double *)xs)[i - t_block_start]);
#endif
}

Expand Down Expand Up @@ -143,24 +142,41 @@ kernel(T *in, T *out, int n_elems, int n_vars)
int compute_block_end = min(compute_block_start + n_elems_per_block * N, n_elems * N);
// TODO: unroll
for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) {
int lid = xid * n_vars - bid * n_threads; // local variable id
// TODO: lid is wrong
int lid = xid * n_vars - bid * n_threads; // local variable id
// int lid = bid * n_elems_per_block; // local variable id
int sid = (i - compute_block_start) % N; // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
int rid = compute_out_offset + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
// int rid = compute_out_offset + lid; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents

// int vid = xid - bid * N;
// int vid = gid / n_threads + bid * n_elems_per_block;

// int lid = bid * n_elems_per_block + (tid / N) * n_vars; // local variable id
// int sid = (i - compute_block_start); // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
int vid = bid * n_elems_per_block + tid / N;

auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 3]);
auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 7]); // TODO: add offset to vars for testing
// auto res = f(xs[lid + 4], xs[lid + 5], xs[lid + 6], xs[lid + 7]); // TODO: add offset to vars for testing
// auto res = f(xs[lid + 3], xs[lid + 4], xs[lid + 10], xs[lid + 12]); // TODO: add offset to vars for testing

xs[rid].cv.ds[sid] = res.cv.ds[sid];
xs[rid].cc.ds[sid] = res.cc.ds[sid];
xs[rid].box.lb.ds[sid] = res.box.lb.ds[sid];
xs[rid].box.ub.ds[sid] = res.box.ub.ds[sid];

if (sid == 0) {
if (sid % N == 0) {
// put res.v into shared memory
xs[rid].cv.v = res.cv.v;
xs[rid].cc.v = res.cc.v;
xs[rid].box.lb.v = res.box.lb.v;
xs[rid].box.ub.v = res.box.ub.v;
}
printf("A [gid:%3d][bid:%3d][tid:%3d][rid:%3d][vid:%3d][xid:%3d][lid:%3d][sid:%3d] in.v: %g %g %g %g res is: %g %g %g %g %g\n",
gid, bid, tid, rid, vid, xid, lid, sid,
xs[lid].cv.v, xs[lid + 1].cv.v, xs[lid + 2].cv.v, xs[lid + 7].cv.v,
res.cv.v,
res.cv.ds[sid], res.cc.ds[sid], res.box.lb.ds[sid], res.box.ub.ds[sid]);
// TODO: maybe we can load it directly to global memory to save on shared memory space?
}

Expand All @@ -174,11 +190,88 @@ kernel(T *in, T *out, int n_elems, int n_vars)
for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
int sid = out_sh_mem_offset + i - out_block_start;
out[i] = ((double *)xs)[sid];
#if PRINT_DEBUG
#if 0 || PRINT_DEBUG
printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n",
gid, bid, tid, out_block_start, out_block_end, sid, sid % 32, i, out[i]);
#endif
}

//
// Other tangent iterations (if n_tangents < n_vars)
//

#if 0
int tangent_offset = N;
int tangent_out_offset = n_elems * n_out_doubles_per_mc;

int k = 1;
// for (int k = 1; k < n_vars / N; k++) {
// printf("k is: %d\n", k);
// seed tangents
// TODO: unroll
for (int i = tid + t_block_start; i < t_block_end; i += n_threads) {
int v = i / n_doubles_per_mc;
#if PRINT_DEBUG
printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][xid:%3d][i:%3d][k:%d] n_elems_tangent: %3d t_block_start: %3d t_block_end: %3d\n",
gid, bid, tid, v, xid, i, k, n_elems_per_block_with_tangents, t_block_start, t_block_end);
#endif

int tangent_idx = (i % (N + 1)) + tangent_offset - 1; // tangent index for this thread, -1 is no tangent but a value to be skipped

if (tangent_idx < 0)
continue;

bool is_cv_or_cc = (i + tangent_offset) % n_doubles_per_mc < 2 * (N + 1); // 2 since we only seed cv and cc
((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0;

#if 0 || PRINT_DEBUG
printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d][k:%d] tangent seed value: %g\n",
gid, bid, tid, v, tangent_idx, xid, i - t_block_start, k, ((double *)xs)[i - t_block_start]);
#endif
}

__syncthreads();

// Actual computation
// TODO: unroll
for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) {
int lid = xid * n_vars - bid * n_threads; // local variable id
int sid = (i - compute_block_start) % N; // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
int rid = compute_out_offset + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents

// auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 3]); // TODO: add offset to vars for testing
// auto res = f(xs[lid + 8], xs[lid + 9], xs[lid + 10], xs[lid + 11]);
auto res = f(xs[lid + 10], xs[lid + 11], xs[lid + 12], xs[lid + 13]);

printf("[gid:%3d][bid:%3d][tid:%3d][rid:%3d] res is : %g %g %g %g %g\n", gid, bid, tid, rid, res.cv.v, res.cv.ds[sid], res.cc.ds[sid], res.box.lb.ds[sid], res.box.ub.ds[sid]);

xs[rid].cv.ds[sid] = res.cv.ds[sid];
xs[rid].cc.ds[sid] = res.cc.ds[sid];
xs[rid].box.lb.ds[sid] = res.box.lb.ds[sid];
xs[rid].box.ub.ds[sid] = res.box.ub.ds[sid];
}

// update inside
// for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
// int sid = out_sh_mem_offset + i - out_block_start;
// int out_idx = i + tangent_offset;
// out[out_idx] = ((double *)xs)[sid];
// }

// update outside
for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
int sid = out_sh_mem_offset + i - out_block_start;
int out_idx = i + tangent_out_offset;
out[out_idx] = ((double *)xs)[sid]; // TODO: skip value in copy (already have it)

#if 1 || PRINT_DEBUG
printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d][k:%d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n",
gid, bid, tid, out_block_start, out_block_end, k, sid, sid % 32, out_idx, out[out_idx]);
#endif
}

#endif
// }
}

/* we really have two scenarios where the GPU should be used differently.
Expand All @@ -197,13 +290,13 @@ we should first try to make use of all the blocks individually doing one mccormi

int main()
{
constexpr int n_elems = 14;
constexpr int n_vars = 16;
constexpr int n = n_elems * n_vars;
constexpr int n_elems = 40;
constexpr int n_vars = 16;
constexpr int n = n_elems * n_vars;
constexpr int n_copy_doubles_per_mc = 2 * (n_vars + 1); // the number of doubles to copy from device back to host per mccormick relaxation. Skips box derivatives. Take cv, cc, lb, ub, cv.ds, cc.ds
// constexpr int n_copy_doubles_per_mc = 4 + 4 * n_vars; // the number of doubles to copy from device back to host per mccormick relaxation. Skips box derivatives. Take cv, cc, lb, ub, cv.ds, cc.ds

constexpr int n_blocks = 256;
constexpr int n_blocks = 10;
constexpr int n_threads = 256;

constexpr int n_tangents = 16; // the number of tangents to perform per mccormick relaxation, a multiple of 32 is ideal
Expand Down Expand Up @@ -250,7 +343,7 @@ int main()
CUDA_CHECK(cudaMemcpy(res, d_res, (n_elems * n_copy_doubles_per_mc) * sizeof(*d_res), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());

#if 0 || PRINT_DEBUG
#if 1 || PRINT_DEBUG
for (auto r : res) {
std::cout << r << std::endl;
}
Expand Down

0 comments on commit ac300be

Please sign in to comment.