current state

neilkichler · Oct 14, 2024 · ac300be · ac300be
1 parent 8b08806
commit ac300be
Showing 1 changed file with 114 additions and 21 deletions.
diff --git a/examples/mccormick/mccormick_v.cu b/examples/mccormick/mccormick_v.cu
@@ -23,7 +23,7 @@ using cu::tangents;
 constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const auto &w)
 {
     auto print = [](auto &x) {
-#if PRINT_DEBUG
+#if 1 || PRINT_DEBUG
         printf("[gid:%3d][bid:%3d][tid:%3d] f {v {%g, %g, %g, %g}, cv [%g, %g, %g, %g] cc [%g, %g, %g, %g] lb [%g, %g, %g, %g] ub [%g, %g, %g, %g]}\n",
                threadIdx.x + blockIdx.x * blockDim.x, blockIdx.x, threadIdx.x,
                x.cv.v,
@@ -44,19 +44,18 @@ constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const a
     // cu::mccormick<tangents<double, N>> a;
 
     print(x);
-    print(y);
+    // print(y);
     auto a = x + y + z + w;
-    auto b = a + a + a + a;
-    auto c = b + b + b;
-    auto d = c + c;
+    // auto b = a + a + a + a;
+    // auto c = b + b + b;
+    // auto d = c + c;
     // print(a);
 
-    return d;
+    return a;
 }
 
 template<typename T, int N>
 __global__ void
-// __launch_bounds__(256, 1)
 kernel(T *in, T *out, int n_elems, int n_vars)
 {
     extern __shared__ cu::mccormick<cu::tangents<T, N>> xs[];
@@ -82,9 +81,9 @@ kernel(T *in, T *out, int n_elems, int n_vars)
     int t_block_start                   = n_elems_per_block_with_tangents * bid;
     int t_block_end                     = min(t_block_start + n_elems_per_block_with_tangents, n_elems * n_vars * n_doubles_per_mc);
 
-    // if (tid == 0)
-    //     printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] elems_per_block: %3d block_start: %3d block_end: %3d\n",
-    //            gid, bid, tid, xid, n_elems_per_block, block_start, block_end);
+    if (tid == 0)
+        printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] elems_per_block: %3d block_start: %3d block_end: %3d\n",
+               gid, bid, tid, xid, n_elems_per_block, block_start, block_end);
 
     // seed tangents
     // TODO: unroll
@@ -101,8 +100,8 @@ kernel(T *in, T *out, int n_elems, int n_vars)
         ((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0;
 
 #if PRINT_DEBUG
-        printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d] tangent seed value: %g\n",
-               gid, bid, tid, v, tangent_idx, xid, i - t_block_start, ((double *)xs)[i - t_block_start]);
+        // printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d] tangent seed value: %g\n",
+        //        gid, bid, tid, v, tangent_idx, xid, i - t_block_start, ((double *)xs)[i - t_block_start]);
 #endif
     }
 
@@ -143,24 +142,41 @@ kernel(T *in, T *out, int n_elems, int n_vars)
     int compute_block_end   = min(compute_block_start + n_elems_per_block * N, n_elems * N);
     // TODO: unroll
     for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) {
-        int lid = xid * n_vars - bid * n_threads;    // local variable id
+        // TODO: lid is wrong
+        int lid = xid * n_vars - bid * n_threads; // local variable id
+        // int lid = bid * n_elems_per_block; // local variable id
         int sid = (i - compute_block_start) % N;     // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
         int rid = compute_out_offset + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
+        // int rid = compute_out_offset + lid; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
+
+        // int vid = xid - bid * N;
+        // int vid = gid / n_threads + bid * n_elems_per_block;
+
+        // int lid = bid * n_elems_per_block + (tid / N) * n_vars; // local variable id
+        // int sid = (i - compute_block_start);         // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
+        int vid = bid * n_elems_per_block + tid / N;
 
-        auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 3]);
+        auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 7]); // TODO: add offset to vars for testing
+        // auto res = f(xs[lid + 4], xs[lid + 5], xs[lid + 6], xs[lid + 7]); // TODO: add offset to vars for testing
+        // auto res = f(xs[lid + 3], xs[lid + 4], xs[lid + 10], xs[lid + 12]); // TODO: add offset to vars for testing
 
         xs[rid].cv.ds[sid]     = res.cv.ds[sid];
         xs[rid].cc.ds[sid]     = res.cc.ds[sid];
         xs[rid].box.lb.ds[sid] = res.box.lb.ds[sid];
         xs[rid].box.ub.ds[sid] = res.box.ub.ds[sid];
 
-        if (sid == 0) {
+        if (sid % N == 0) {
             // put res.v into shared memory
             xs[rid].cv.v     = res.cv.v;
             xs[rid].cc.v     = res.cc.v;
             xs[rid].box.lb.v = res.box.lb.v;
             xs[rid].box.ub.v = res.box.ub.v;
         }
+        printf("A [gid:%3d][bid:%3d][tid:%3d][rid:%3d][vid:%3d][xid:%3d][lid:%3d][sid:%3d] in.v: %g %g %g %g res is: %g %g %g %g %g\n",
+               gid, bid, tid, rid, vid, xid, lid, sid,
+               xs[lid].cv.v, xs[lid + 1].cv.v, xs[lid + 2].cv.v, xs[lid + 7].cv.v,
+               res.cv.v,
+               res.cv.ds[sid], res.cc.ds[sid], res.box.lb.ds[sid], res.box.ub.ds[sid]);
         // TODO: maybe we can load it directly to global memory to save on shared memory space?
     }
 
@@ -174,11 +190,88 @@ kernel(T *in, T *out, int n_elems, int n_vars)
     for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
         int sid = out_sh_mem_offset + i - out_block_start;
         out[i]  = ((double *)xs)[sid];
-#if PRINT_DEBUG
+#if 0 || PRINT_DEBUG
         printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n",
                gid, bid, tid, out_block_start, out_block_end, sid, sid % 32, i, out[i]);
 #endif
     }
+
+    //
+    // Other tangent iterations (if n_tangents < n_vars)
+    //
+
+#if 0
+    int tangent_offset     = N;
+    int tangent_out_offset = n_elems * n_out_doubles_per_mc;
+
+    int k = 1;
+    // for (int k = 1; k < n_vars / N; k++) {
+    //     printf("k is: %d\n", k);
+    // seed tangents
+    // TODO: unroll
+    for (int i = tid + t_block_start; i < t_block_end; i += n_threads) {
+        int v = i / n_doubles_per_mc;
+#if PRINT_DEBUG
+        printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][xid:%3d][i:%3d][k:%d] n_elems_tangent: %3d t_block_start: %3d t_block_end: %3d\n",
+               gid, bid, tid, v, xid, i, k, n_elems_per_block_with_tangents, t_block_start, t_block_end);
+#endif
+
+        int tangent_idx = (i % (N + 1)) + tangent_offset - 1; // tangent index for this thread, -1 is no tangent but a value to be skipped
+
+        if (tangent_idx < 0)
+            continue;
+
+        bool is_cv_or_cc                  = (i + tangent_offset) % n_doubles_per_mc < 2 * (N + 1); // 2 since we only seed cv and cc
+        ((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0;
+
+#if 0 || PRINT_DEBUG
+        printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d][k:%d] tangent seed value: %g\n",
+               gid, bid, tid, v, tangent_idx, xid, i - t_block_start, k, ((double *)xs)[i - t_block_start]);
+#endif
+    }
+
+    __syncthreads();
+
+    // Actual computation
+    // TODO: unroll
+    for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) {
+        int lid = xid * n_vars - bid * n_threads;    // local variable id
+        int sid = (i - compute_block_start) % N;     // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
+        int rid = compute_out_offset + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
+
+        // auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 3]); // TODO: add offset to vars for testing
+        // auto res = f(xs[lid + 8], xs[lid + 9], xs[lid + 10], xs[lid + 11]);
+        auto res = f(xs[lid + 10], xs[lid + 11], xs[lid + 12], xs[lid + 13]);
+
+        printf("[gid:%3d][bid:%3d][tid:%3d][rid:%3d] res is : %g %g %g %g %g\n", gid, bid, tid, rid, res.cv.v, res.cv.ds[sid], res.cc.ds[sid], res.box.lb.ds[sid], res.box.ub.ds[sid]);
+
+        xs[rid].cv.ds[sid]     = res.cv.ds[sid];
+        xs[rid].cc.ds[sid]     = res.cc.ds[sid];
+        xs[rid].box.lb.ds[sid] = res.box.lb.ds[sid];
+        xs[rid].box.ub.ds[sid] = res.box.ub.ds[sid];
+    }
+
+    // update inside
+    // for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
+    //     int sid      = out_sh_mem_offset + i - out_block_start;
+    //     int out_idx  = i + tangent_offset;
+    //     out[out_idx] = ((double *)xs)[sid];
+    // }
+
+    // update outside
+    for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
+        int sid      = out_sh_mem_offset + i - out_block_start;
+        int out_idx  = i + tangent_out_offset;
+        out[out_idx] = ((double *)xs)[sid]; // TODO: skip value in copy (already have it)
+
+#if 1 || PRINT_DEBUG
+        printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d][k:%d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n",
+               gid, bid, tid, out_block_start, out_block_end, k, sid, sid % 32, out_idx, out[out_idx]);
+#endif
+    }
+
+#endif
+    // }
 }
 
 /* we really have two scenarios where the GPU should be used differently.
@@ -197,13 +290,13 @@ we should first try to make use of all the blocks individually doing one mccormi
 
 int main()
 {
-    constexpr int n_elems = 14;
-    constexpr int n_vars  = 16;
-    constexpr int n       = n_elems * n_vars;
+    constexpr int n_elems               = 40;
+    constexpr int n_vars                = 16;
+    constexpr int n                     = n_elems * n_vars;
     constexpr int n_copy_doubles_per_mc = 2 * (n_vars + 1); // the number of doubles to copy from device back to host per mccormick relaxation. Skips box derivatives. Take cv, cc, lb, ub, cv.ds, cc.ds
     // constexpr int n_copy_doubles_per_mc = 4 + 4 * n_vars; // the number of doubles to copy from device back to host per mccormick relaxation. Skips box derivatives. Take cv, cc, lb, ub, cv.ds, cc.ds
 
-    constexpr int n_blocks  = 256;
+    constexpr int n_blocks  = 10;
     constexpr int n_threads = 256;
 
     constexpr int n_tangents = 16; // the number of tangents to perform per mccormick relaxation, a multiple of 32 is ideal
@@ -250,7 +343,7 @@ int main()
     CUDA_CHECK(cudaMemcpy(res, d_res, (n_elems * n_copy_doubles_per_mc) * sizeof(*d_res), cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaDeviceSynchronize());
 
-#if 0 || PRINT_DEBUG
+#if 1 || PRINT_DEBUG
     for (auto r : res) {
         std::cout << r << std::endl;
     }