From 489c073d271acfe6449b8a5dfadf4e602a5b9421 Mon Sep 17 00:00:00 2001
From: Neil Kichler <neil.kichler@gmail.com>
Date: Wed, 16 Oct 2024 17:48:45 +0200
Subject: [PATCH] further failed attempts to have shared memory work beyond 1
 tangent at the same time

---
 examples/mccormick/mccormick_v.cu            | 309 ++++++++++++++++++-
 include/cutangent/arithmetic/intrinsic_v.cuh |  62 +++-
 2 files changed, 352 insertions(+), 19 deletions(-)

diff --git a/examples/mccormick/mccormick_v.cu b/examples/mccormick/mccormick_v.cu
index d553b5c..a2b181e 100644
--- a/examples/mccormick/mccormick_v.cu
+++ b/examples/mccormick/mccormick_v.cu
@@ -15,7 +15,7 @@
 
 using cu::tangents;
 
-#define PRINT_DEBUG 0
+#define PRINT_DEBUG 1
 
 #define USE_VECTOR_LOAD_128 0
 #define USE_VECTOR_LOAD_256 0
@@ -23,7 +23,7 @@ using cu::tangents;
 constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const auto &w)
 {
     auto print = [](auto &x) {
-#if 1 || PRINT_DEBUG
+#if 1 && PRINT_DEBUG
         printf("[gid:%3d][bid:%3d][tid:%3d] f {v {%g, %g, %g, %g}, cv [%g, %g, %g, %g] cc [%g, %g, %g, %g] lb [%g, %g, %g, %g] ub [%g, %g, %g, %g]}\n",
                threadIdx.x + blockIdx.x * blockDim.x, blockIdx.x, threadIdx.x,
                x.cv.v,
@@ -43,7 +43,7 @@ constexpr __device__ auto f(const auto &x, const auto &y, const auto &z, const a
     // auto a = x + y;
     // cu::mccormick<tangents<double, N>> a;
 
-    print(x);
+    // print(x);
     // print(y);
     auto a = x + y + z + w;
     // auto b = a + a + a + a;
@@ -89,7 +89,8 @@ kernel(T *in, T *out, int n_elems, int n_vars)
     // TODO: unroll
     for (int i = tid + t_block_start; i < t_block_end; i += n_threads) {
         int v = i / n_doubles_per_mc;
-#if PRINT_DEBUG
+#if 0 && PRINT_DEBUG
+        printf("[gid:%3d][bid:%3d][tid:%3d][t_bstart:%d] i: %3d\n", gid, bid, tid, t_block_start, i);
         printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][xid:%3d][i:%3d] n_elems_tangent: %3d t_block_start: %3d t_block_end: %3d\n",
                gid, bid, tid, v, xid, i, n_elems_per_block_with_tangents, t_block_start, t_block_end);
 #endif
@@ -99,14 +100,17 @@ kernel(T *in, T *out, int n_elems, int n_vars)
         bool is_cv_or_cc                  = i % n_doubles_per_mc < 2 * (N + 1); // 2 since we only seed cv and cc
         ((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0;
 
-#if PRINT_DEBUG
-        // printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d] tangent seed value: %g\n",
-        //        gid, bid, tid, v, tangent_idx, xid, i - t_block_start, ((double *)xs)[i - t_block_start]);
+#if 0 && PRINT_DEBUG
+        printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d][t_bstart:%3d] tangent seed value: %g\n",
+               gid, bid, tid, v, tangent_idx, xid, i - t_block_start, t_block_start, ((double *)xs)[i - t_block_start]);
 #endif
     }
 
     __syncthreads();
 
+#if 1
+
+
     // Load elements from global memory into shared memory trying to get a balanced allocation in all blocks
 #if USE_VECTOR_LOAD_128
     for (int i = tid * 2 + block_start; i + 1 < block_end; i += n_threads) {
@@ -190,12 +194,14 @@ kernel(T *in, T *out, int n_elems, int n_vars)
     for (int i = tid + out_block_start; i < out_block_end; i += n_threads) {
         int sid = out_sh_mem_offset + i - out_block_start;
         out[i]  = ((double *)xs)[sid];
-#if 0 || PRINT_DEBUG
+#if 0 && PRINT_DEBUG
         printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n",
                gid, bid, tid, out_block_start, out_block_end, sid, sid % 32, i, out[i]);
 #endif
     }
 
+#endif
+
     //
     // Other tangent iterations (if n_tangents < n_vars)
     //
@@ -264,7 +270,7 @@ kernel(T *in, T *out, int n_elems, int n_vars)
         int out_idx  = i + tangent_out_offset;
         out[out_idx] = ((double *)xs)[sid]; // TODO: skip value in copy (already have it)
 
-#if 1 || PRINT_DEBUG
+#if 0 && PRINT_DEBUG
         printf("[gid:%3d][bid:%3d][tid:%3d][bstart:%3d][bend:%3d][k:%d] copy shared [%3d] (bank: [%3d]) into global [%3d] value: %g\n",
                gid, bid, tid, out_block_start, out_block_end, k, sid, sid % 32, out_idx, out[out_idx]);
 #endif
@@ -288,9 +294,281 @@ kernel(T *in, T *out, int n_elems, int n_vars)
 we should first try to make use of all the blocks individually doing one mccormick computation (1a).
 */
 
+// struct range
+// {
+//     int begin;
+//     int end;
+// };
+//
+// constexpr range value_range(int bid, int n_elems, int n_blocks, int n_vars)
+// {
+//     int n = n_elems * n_vars;
+//
+//     // block range when considering only mccormick values (for initial copy from global memory)
+//     // int n_elems_per_block = (n_elems + n_blocks - 1) >> int(log2(n_blocks));
+//     int n_elems_per_block = (n_elems + n_blocks - 1) / n_blocks;
+//     int block_start       = n_elems_per_block * bid * 4 * n_vars;
+//     int block_end         = std::min(block_start + n_elems_per_block * 4 * n_vars, n * 4);
+//
+//     return { block_start, block_end };
+// };
+
+
+#if 0
+__device__ void debug_sync()
+{
+    __syncthreads();
+}
+
+__device__ void real_sync()
+{
+    __syncthreads();
+}
+extern __shared__ double xs[];
+
+template<typename T, int N>
+__shared__ tangents<T, N> *tangents_block_buffer;
+
+template<typename T, int N>
+__global__ void
+kernel(T *in, T *out, int n_elems, int n_vars)
+{
+    cu::mccormick<tangents<T, N>> *xs_mc = (cu::mccormick<tangents<T, N>> *)xs;
+
+    assert(N <= n_vars && "number of tangents (N) must be <= number of variables (n_vars)");
+    // might want to assert that N should be power of 2, and ideally multiple of 32
+
+    const int n                    = n_elems * n_vars;                      // total number of mccormick variables across all elements
+    const int gid                  = threadIdx.x + blockIdx.x * blockDim.x; // global id
+    const int bid                  = blockIdx.x;                            // block id
+    const int tid                  = threadIdx.x;                           // thread id inside block
+    const int n_threads            = blockDim.x;                            // number of threads in a block
+    const int n_blocks             = gridDim.x;                             // number of blocks in the grid  TODO: should probably be power of two for fast % operation
+    const int n_doubles_per_mc     = 4 * (N + 1);                           // 4 for cv, cc, lb, ub
+    const int n_out_doubles_per_mc = 2 * (N + 1);                           // 2 for cv, cc
+    const int xid                  = gid / n_vars;                          // mccormick id in xs
+
+    const int n_elems_per_block = (n_elems + n_blocks - 1) / n_blocks;
+
+    if (tid == 0) {
+        printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] n_elems_per_block: %3d\n",
+               gid, bid, tid, xid, n_elems_per_block);
+    }
+
+    const int block_start = n_elems_per_block * bid * 4 * n_vars;
+    const int block_end   = min(block_start + n_elems_per_block * 4 * n_vars, n * 4);
+
+    if (tid == 0)
+        printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] elems_per_block: %3d block_start: %4d block_end: %4d\n",
+               gid, bid, tid, xid, n_elems_per_block, block_start, block_end);
+
+    // block range when considering tangents as well
+    const int n_elems_per_block_with_tangents = n_elems_per_block * n_vars * n_doubles_per_mc;
+    const int t_block_start                   = n_elems_per_block_with_tangents * bid;
+    const int t_block_end                     = min(t_block_start + n_elems_per_block_with_tangents, n_elems * n_vars * n_doubles_per_mc);
+
+    if (tid == 0)
+        printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] elems_per_block_w_tan: %3d t_block_start: %4d t_block_end: %4d\n",
+               gid, bid, tid, xid, n_elems_per_block_with_tangents, t_block_start, t_block_end);
+
+    debug_sync();
+
+    // All the configurations that need to be checked:
+    //
+    // 1. n_tangents == n_vars [x]
+    // 2. n_tangents < n_vars  [x]
+    // 3. n_tangents > n_vars  -> assert [x]
+    //
+    // - n_blocks < n_elems  [x]
+    // - n_blocks == n_elems [x]
+    // - n_blocks > n_elems  [x]
+    //
+    // - n_threads < n_vars  [x]
+    // - n_threads == n_vars [x]
+    // - n_threads > n_vars  [x]
+    int iter = 0;
+    for (int i = tid + t_block_start; i < t_block_end; i += n_threads) {
+        int v           = i / n_doubles_per_mc;
+        int tangent_idx = (i % (N + 1)) - 1;                                    // tangent index for this thread, -1 is no tangent but a value to be skipped
+                                                                                // faster alternative: int tangent_idx = x - floor(1/(N+1) * x) * (N+1) - 1; // TODO: check if compiler figures this out
+        bool is_cv_or_cc                  = i % n_doubles_per_mc < 2 * (N + 1); // 2 since we only seed cv and cc
+        ((double *)xs)[i - t_block_start] = (v % n_vars == tangent_idx) && is_cv_or_cc ? 1.0 : 0.0;
+
+        // if (bid == 0 && iter == 0) {
+        // if (bid == 0) {
+        //     printf("[gid:%3d][bid:%3d][tid:%3d][vid:%3d][tangent_idx:%3d][xid:%3d][i:%3d][t_bstart:%3d][iter:%3d] tangent seed value: %g\n",
+        //            gid, bid, tid, v, tangent_idx, xid, i - t_block_start, t_block_start, iter, ((double *)xs)[i - t_block_start]);
+        // }
+
+        iter++;
+    }
+
+    real_sync();
+
+    // All the configurations that need to be checked:
+    //
+    // 1. n_tangents == n_vars [x]
+    // 2. n_tangents < n_vars  [x]
+    // 3. n_tangents > n_vars  -> assert [x]
+    //
+    // - n_blocks < n_elems  [x]
+    // - n_blocks == n_elems [x]
+    // - n_blocks > n_elems  [x]
+    //
+    // - n_threads < n_vars  [x]
+    // - n_threads == n_vars [x]
+    // - n_threads > n_vars  [x]
+
+    // Load elements from global memory into shared memory trying to get a balanced allocation in all blocks
+#if USE_VECTOR_LOAD_128
+    for (int i = tid * 2 + block_start; i + 1 < block_end; i += n_threads) {
+        int sid                       = (i - block_start) * (N + 1);
+        double2 tmp                   = *(double2 *)&in[i]; // init value
+        ((double *)xs)[sid]           = tmp.x;
+        ((double *)xs)[sid + (N + 1)] = tmp.y;
+        // printf("[gid:%3d][bid:%3d][tid:%3d][i:%3d] init value: %g\n", gid, bid, tid, i, in[i]);
+    }
+#elif USE_VECTOR_LOAD_256
+    for (int i = tid * 4 + block_start; i + 3 < block_end; i += n_threads) {
+        int sid                           = (i - block_start) * (N + 1);
+        double4 tmp                       = *(double4 *)&in[i]; // init value
+        ((double *)xs)[sid]               = tmp.x;
+        ((double *)xs)[sid + (N + 1)]     = tmp.y;
+        ((double *)xs)[sid + 2 * (N + 1)] = tmp.z;
+        ((double *)xs)[sid + 3 * (N + 1)] = tmp.w;
+        // printf("[gid:%3d][bid:%3d][tid:%3d][i:%3d] init value: %g\n", gid, bid, tid, i, in[i]);
+    }
+#else
+    for (int i = tid + block_start; i < block_end; i += n_threads) {
+        int sid             = (i - block_start) * (N + 1);
+        ((double *)xs)[sid] = in[i]; // init value
+        // printf("[gid:%3d][bid:%3d][tid:%3d][sid:%4d][i:%3d] init value: %g\n", gid, bid, tid, sid, i, in[i]);
+    }
+#endif
+
+    real_sync();
+
+    // Actual computation
+    int compute_out_offset_idx = n_elems_per_block * n_vars; // index offset into shared memory
+    int compute_block_start    = n_elems_per_block * bid * N;
+    int compute_block_end      = min(compute_block_start + n_elems_per_block * N, n_elems * N);
+
+    // tangents_block_buffer<T, N> = &((tangents<T, N> *)xs)[compute_out_offset_idx];
+    //
+    // if (tid == 0) {
+    //     printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] tangents_block_buffer: %p, el: %g\n", gid, bid, tid, xid, tangents_block_buffer<T, N>, tangents_block_buffer<T, N>[0].v);
+    //     printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] xs: %p, xs[0]: %g\n", gid, bid, tid, xid, xs, xs[tid]);
+    // }
+
+#if 1
+    iter = 0;
+    // TODO: unroll
+    // for (int i = tid + block_start; i < block_end; i += n_threads) {
+    for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) {
+
+        int lid = xid * n_vars - bid * n_threads;        // local variable id offset
+        int sid = (i - t_block_start) % N;               // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
+        int rid = compute_out_offset_idx + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
+        int vid = bid * n_elems_per_block + tid / N;
+
+        if (bid == 0) {
+            printf("[gid:%3d][bid:%3d][tid:%3d][lid:%3d][sid:%3d][rid:%3d][vid:%3d][i:%3d][iter:%3d] inputs (cv.v): %g %g %g %g\n",
+                   gid, bid, tid, lid, sid, rid, vid, i, iter, xs_mc[lid].cv.v, xs_mc[lid + 1].cv.v, xs_mc[lid + 2].cv.v, xs_mc[lid + 7].cv.v);
+        }
+
+        // if (bid > 0)
+        //     continue;
+
+        // TODO: this can be replaced by a single pointer input to &xs[lid]
+        //       in fact, here should reside a cuda device graph
+        auto res = f(xs_mc[lid], xs_mc[lid + 1], xs_mc[lid + 2], xs_mc[lid + 7]);
+
+        iter++;
+    }
+
+#elif 0
+    //
+    // if (tid == 0)
+    //     printf("[gid:%4d][bid:%3d][tid:%3d][xid:%3d] compute_out_offset_idx: %3d compute_block_start: %4d compute_block_end: %4d\n",
+    //            gid, bid, tid, xid, compute_out_offset_idx, compute_block_start, compute_block_end);
+
+    // All the configurations that need to be checked:
+    //
+    // 1. n_tangents == n_vars [ ]
+    // 2. n_tangents < n_vars  [ ]
+    // 3. n_tangents > n_vars  -> assert [x]
+    //
+    // - n_blocks < n_elems  [ ]
+    // - n_blocks == n_elems [ ]
+    // - n_blocks > n_elems  [ ]
+    //
+    // - n_threads < n_vars  [ ]
+    // - n_threads == n_vars [ ]
+    // - n_threads > n_vars  [ ]
+
+    iter = 0;
+    // TODO: unroll
+    // for (int i = tid + block_start; i < block_end; i += n_threads) {
+    for (int i = tid + compute_block_start; i < compute_block_end; i += n_threads) {
+
+        // TODO: lid is wrong
+        int lid = xid * n_vars - bid * n_threads; // local variable id
+        // int lid = bid * n_elems_per_block; // local variable id
+        int sid = (i - t_block_start) % N;               // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
+        int rid = compute_out_offset_idx + lid / n_vars; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
+        // int rid = compute_out_offset_idx + lid; // result id in shared memory - offset exists to not overwrite inputs that might be used for different sets of seed tangents
+
+        // int vid = xid - bid * N;
+        // int vid = gid / n_threads + bid * n_elems_per_block;
+
+        // int lid = bid * n_elems_per_block + (tid / N) * n_vars; // local variable id
+        // int sid = (i - compute_block_start);         // shared memory tangent id (subtracting compute_blocK_start is not really needed since we are starting at a multiple of N)
+        int vid = bid * n_elems_per_block + tid / N;
+
+        if (bid == 0) {
+            printf("[gid:%3d][bid:%3d][tid:%3d][lid:%3d][sid:%3d][rid:%3d][vid:%3d][i:%3d][iter:%3d] inputs (cv.v): %g %g %g %g\n",
+                   gid, bid, tid, lid, sid, rid, vid, i, iter, xs[lid].cv.v, xs[lid + 1].cv.v, xs[lid + 2].cv.v, xs[lid + 7].cv.v);
+        }
+
+        // if (bid > 0)
+        //     continue;
+
+        // TODO: this can be replaced by a single pointer input to &xs[lid]
+        auto res = f(xs[lid], xs[lid + 1], xs[lid + 2], xs[lid + 7]);
+
+#if 1
+        xs[rid].cv.ds[sid]     = res.cv.ds[sid];
+        xs[rid].cc.ds[sid]     = res.cc.ds[sid];
+        xs[rid].box.lb.ds[sid] = res.box.lb.ds[sid];
+        xs[rid].box.ub.ds[sid] = res.box.ub.ds[sid];
+
+        // TODO: if we let every thread do the update, it might be a broadcast operation into shared memory
+        //       and thus the same speed without the if statement.
+        if (sid % N == 0) {
+            // put res.v into shared memory
+            xs[rid].cv.v     = res.cv.v;
+            xs[rid].cc.v     = res.cc.v;
+            xs[rid].box.lb.v = res.box.lb.v;
+            xs[rid].box.ub.v = res.box.ub.v;
+        }
+        if (bid == 0)
+            printf("A [gid:%3d][bid:%3d][tid:%3d][rid:%3d][vid:%3d][xid:%3d][lid:%3d][sid:%3d][iter:%3d] in.v: %g %g %g %g, res is: %g %g %g %g %g\n",
+                   gid, bid, tid, rid, vid, xid, lid, sid, iter,
+                   xs[lid].cv.v, xs[lid + 1].cv.v, xs[lid + 2].cv.v, xs[lid + 7].cv.v,
+                   res.cv.v,
+                   res.cv.ds[sid], res.cc.ds[sid], res.box.lb.ds[sid], res.box.ub.ds[sid]);
+#endif
+        iter++;
+    }
+#endif
+}
+
+#endif
+
 int main()
 {
-    constexpr int n_elems               = 40;
+    // constexpr int n_elems = 31;
+    constexpr int n_elems               = 8;
     constexpr int n_vars                = 16;
     constexpr int n                     = n_elems * n_vars;
     constexpr int n_copy_doubles_per_mc = 2 * (n_vars + 1); // the number of doubles to copy from device back to host per mccormick relaxation. Skips box derivatives. Take cv, cc, lb, ub, cv.ds, cc.ds
@@ -298,15 +576,18 @@ int main()
 
     constexpr int n_blocks  = 10;
     constexpr int n_threads = 256;
+    // constexpr int n_threads = n_vars;
+    // constexpr int n_threads = 8;
 
     constexpr int n_tangents = 16; // the number of tangents to perform per mccormick relaxation, a multiple of 32 is ideal
 
     constexpr int n_elems_per_block = std::ceil(double(n_elems) / n_blocks);
     constexpr int n_vars_per_block  = n_vars * n_elems_per_block; // the number of mccormick variables to access in shared memory per block
 
-    assert(n_mccormick >= n_vars && "n_mccormick must be >= n_vars");
     static_assert(n_tangents <= n_vars, "n_tangents must be <= n_vars");
 
+    static_assert(n_blocks >= n_elems, "n_blocks must be >= n_elems for now");
+
     cu::mccormick<double> xs[n_elems * n_vars] {};
     double res[n_elems * n_copy_doubles_per_mc] {};
 
@@ -318,7 +599,7 @@ int main()
         }
     }
 
-#if PRINT_DEBUG
+#if 0 && PRINT_DEBUG
     for (auto x : xs) {
         std::cout << "x is: " << x << std::endl;
     }
@@ -327,7 +608,7 @@ int main()
     constexpr int n_bytes_shared_in  = n_vars_per_block * 4 * sizeof(double) * (n_tangents + 1);
     constexpr int n_bytes_shared_out = n_elems_per_block * 4 * sizeof(double) * (n_tangents + 1);
     constexpr int n_bytes_shared     = n_bytes_shared_in + n_bytes_shared_out;
-    printf("n_bytes_shared = %d B\n", n_bytes_shared);
+    printf("n_bytes_shared = %g KiB\n", n_bytes_shared / 1024.0);
 
     double *d_xs;  // we only use a single double array for easier coalescing
     double *d_res; // same as above
@@ -343,7 +624,7 @@ int main()
     CUDA_CHECK(cudaMemcpy(res, d_res, (n_elems * n_copy_doubles_per_mc) * sizeof(*d_res), cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaDeviceSynchronize());
 
-#if 1 || PRINT_DEBUG
+#if 1 && PRINT_DEBUG
     for (auto r : res) {
         std::cout << r << std::endl;
     }
diff --git a/include/cutangent/arithmetic/intrinsic_v.cuh b/include/cutangent/arithmetic/intrinsic_v.cuh
index 03dae30..853e123 100644
--- a/include/cutangent/arithmetic/intrinsic_v.cuh
+++ b/include/cutangent/arithmetic/intrinsic_v.cuh
@@ -48,18 +48,51 @@ using cu::tangents;
 //
 //     template<> inline __device__ tangent<double> fma_down  (tangent<double> x, tangent<double> y, tangent<double> z) { return x * y + z; }
 //     template<> inline __device__ tangent<double> fma_up    (tangent<double> x, tangent<double> y, tangent<double> z) { return x * y + z; }
+
+
+#if 0
+extern __shared__ double xs[];
+
+template<typename T, int N>
+__shared__ tangents<T, N> *tangents_block_buffer;
+
 template<int N>
 inline __device__ tangents<double, N> add_down(const tangents<double, N> &x, const tangents<double, N> &y)
 {
-    __shared__ tangents<double, N> res;
-    res.v = add_down(x.v, y.v);
+    // __shared__ tangents<double, N> res[4];
+
+    printf("intrinsic tangents_block_buffer: %p, el: %g\n", 
+           tangents_block_buffer<double, N>, tangents_block_buffer<double, N>[0].v);
+
+    // if (tid == 0) {
+        // printf("[gid:%3d][bid:%3d][tid:%3d][xid:%3d] xs: %p, xs[0]: %g\n", gid, bid, tid, xid, xs, xs[tid]);
+    // }
+
+    // tangents<double, N> res = (tangents<double, N> *)xs;
 
     // multiple mccormick relaxations in one block
     int gid = blockIdx.x * blockDim.x + threadIdx.x;
-    for (int i = gid % N; i < N; i += blockDim.x * gridDim.x) {
-        // printf("[gid: %3d][bid: %3d][tid: %3d] i: %3d\n", gid, blockIdx.x, threadIdx.x, i);
-        res.ds[i] = add_down(x.ds[i], y.ds[i]);
+
+    int rid = gid / N;
+
+    #if 0
+    // if (gid < N) {
+    if (gid > N and gid < 2 * N) {
+
+        res[rid].v = add_down(x.v, y.v);
+
+        for (int i = gid % N; i < N; i += blockDim.x * gridDim.x) {
+            // if (threadIdx.x == 0) {
+                printf("[gid: %4d][bid: %3d][tid: %3d][N:%3d] x.cv.v %g, y.cv.v %g, res.cv.v: %g\n",
+                       gid, blockIdx.x, threadIdx.x, N, x.v, y.v, res.v);
+            // }
+
+
+            res[rid].ds[i] = add_down(x.ds[i], y.ds[i]);
+        }
+
     }
+    #endif
 
     // if (threadIdx.x == 0) {
     //     res.v = add_down(x.v, y.v);
@@ -78,9 +111,28 @@ inline __device__ tangents<double, N> add_down(const tangents<double, N> &x, con
     // for (int i = 0; i < N; i++) {
     //     res.ds[i] = add_down(x.ds[i], y.ds[i]);
     // }
+    return {};
+    // return res;
+}
+#else
+
+template<int N>
+inline __device__ tangents<double, N> add_down(const tangents<double, N> &a, const tangents<double, N> &b)
+{
+    __shared__ tangents<double, N> res;
+    res.v = add_down(a.v, b.v);
+
+    // multiple mccormick relaxations in one block
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (int i = gid % N; i < N; i += blockDim.x * gridDim.x) {
+        res.ds[i] = add_down(a.ds[i], b.ds[i]);
+    }
     return res;
 }
 
+
+#endif
+
 template<int N>
 inline __device__ tangents<double, N> add_up(const tangents<double, N> &a, const tangents<double, N> &b)
 {