From 956ee4dc3bb7c78ca5df1a2b0ec0cf8d545689cc Mon Sep 17 00:00:00 2001
From: Hanting Zhang <winston@lurk-lab.com>
Date: Fri, 9 Feb 2024 19:06:41 +0000
Subject: [PATCH] wip debug

---
 msm/pippenger.cuh | 176 ++++++++++++++++++++++++++--------------------
 1 file changed, 101 insertions(+), 75 deletions(-)
diff --git a/msm/pippenger.cuh b/msm/pippenger.cuh
index 104da21..f1cb270 100644
--- a/msm/pippenger.cuh
+++ b/msm/pippenger.cuh
@@ -357,13 +357,26 @@ template <class bucket_t, class point_t, class affine_t, class scalar_t,
 class msm_t
 {
     const gpu_t &gpu;
+
+    // main data: point/scalar buffers and the element counts describing them
+    bool owned;          // when true, ~msm_t() releases d_points via gpu.Dfree
+    affine_h *d_points;  // allocated by the host-points constructor, or supplied by the caller
+    scalar_t *d_scalars; // set by setup_scratch() to the start of the temp region of d_total_blob
+    uint32_t *d_pidx;    // initialised to nullptr; its scratch carve-out is still commented out
     size_t npoints;
+    size_t nscalars;     // NOTE(review): never assigned in the visible hunks -- confirm intended use
+
+    // per setup constants, assigned by setup_scratch()
     uint32_t wbits, nwins;
+    uint32_t batch;
+    uint32_t stride;
+
+    // auxiliary space: d_total_blob is one gpu.Dmalloc allocation that the views below point into
+    char *d_total_blob;
     bucket_h *d_buckets;
-    affine_h *d_points;
-    scalar_t *d_scalars;
     vec2d_t<uint32_t> d_hist;
-    bool owned;
+    vec2d_t<uint2> d_temps;
+    vec2d_t<uint32_t> d_digits;
 
     template <typename T>
     using vec_t = slice_t<T>;
@@ -387,60 +400,49 @@ class msm_t
     }
 
 public:
-    msm_t(const affine_t points[], size_t np, bool owned,
-          size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1)
-        : owned(owned), gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr)
+    // Build an MSM from host points: allocate npoints entries with gpu.Dmalloc and copy the points in.
+    msm_t(const affine_t points[], size_t npoints, bool owned, int device_id = -1) : gpu(select_gpu(device_id))
     {
-        npoints = (np + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
-
-        wbits = 17;
-        if (npoints > 192)
-        {
-            wbits = std::min(lg2(npoints + npoints / 2) - 8, 18);
-            if (wbits < 10)
-                wbits = 10;
-        }
-        else if (npoints > 0)
-        {
-            wbits = 10;
-        }
-        nwins = (scalar_t::bit_length() - 1) / wbits + 1;
+        // defaults; wbits/nwins/batch/stride and the scratch views stay unset until setup_scratch()
+        this->d_points = nullptr;
+        this->d_scalars = nullptr;
+        this->d_pidx = nullptr;
+        this->npoints = npoints;
+        this->owned = owned; // when true, ~msm_t() frees d_points
 
-        uint32_t row_sz = 1U << (wbits - 1);
+        this->d_total_blob = nullptr; // no scratch yet; ~msm_t() only frees a non-null blob
 
-        size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
-        size_t d_blob_sz = (d_buckets_sz * sizeof(d_buckets[0])) + (nwins * row_sz * sizeof(uint32_t));
+        d_points = reinterpret_cast<decltype(d_points)>(gpu.Dmalloc(npoints * sizeof(d_points[0])));
+        gpu.HtoD(d_points, points, npoints, sizeof(affine_h)); // NOTE(review): the removed call passed ffi_affine_sz here -- confirm sizeof(affine_h) is intended
+        CUDA_OK(cudaGetLastError());
+    }
 
-        d_buckets = reinterpret_cast<decltype(d_buckets)>(gpu.Dmalloc(d_blob_sz));
-        d_hist = vec2d_t<uint32_t>(&d_buckets[d_buckets_sz], row_sz);
-        if (points)
-        {
-            d_points = reinterpret_cast<decltype(d_points)>(gpu.Dmalloc(points ? npoints * sizeof(d_points[0]) : 0));
-            gpu.HtoD(d_points, points, np, ffi_affine_sz);
-        }
+    msm_t(affine_h *d_points, size_t npoints, int device_id = -1) : gpu(select_gpu(device_id))
+    {
+        // Wrap caller-provided points: the pointer is stored as-is and, with owned = false, never freed by ~msm_t()
+        this->d_points = d_points;
+        this->d_scalars = nullptr;
+        this->d_pidx = nullptr;
+        this->npoints = npoints;
+        this->owned = false;
 
-        if (owned)
-            npoints = 0;
-        else
-            npoints = np;
+        this->d_total_blob = nullptr; // no scratch yet; wbits/nwins/batch/stride stay unset until setup_scratch()
     }
-    inline msm_t(vec_t<affine_t> points, size_t ffi_affine_sz = sizeof(affine_t),
-                 int device_id = -1)
-        : msm_t(points, points.size(), ffi_affine_sz, device_id){};
-    inline msm_t(int device_id = -1)
-        : msm_t(nullptr, 0, 0, device_id){};
+
     ~msm_t()
     {
         gpu.sync();
-        if (d_buckets)
-            gpu.Dfree(d_buckets);
+        if (d_total_blob) // scratch blob from setup_scratch(); d_buckets, d_hist, d_temps and d_digits all point into it
+            gpu.Dfree(d_total_blob);
         if (d_points && owned)
             gpu.Dfree(d_points);
     }
+
     affine_h *get_d_points()
     {
         return d_points;
     }
+
     void set_d_points(affine_h *d_points)
     {
         assert(!this->owned);
@@ -499,19 +501,67 @@ private:
     }
 
 public:
-    RustError invoke(point_t &out, const affine_t *points_, size_t npoints,
-                     const scalar_t *scalars, bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
+    // Derive the per-setup constants (wbits, nwins, batch, stride) from npoints and carve the
+    // bucket, histogram, temp/scalar and digit regions out of one gpu.Dmalloc allocation.
+    void setup_scratch(size_t npoints)
     {
-        assert(this->npoints == 0 || npoints <= this->npoints);
+        this->npoints = npoints; // invoke() asserts its own npoints against this value
+
+        uint32_t lg_n = lg2(npoints + npoints / 2);
+        // window width for npoints > 192: lg_n - 8, clamped to [10, 18]
+        wbits = 17;
+        if (npoints > 192)
+        {
+            wbits = std::min(lg_n - 8, (uint32_t)18);
+            if (wbits < 10)
+                wbits = 10;
+        }
+        else if (npoints > 0)
+        {
+            wbits = 10;
+        }
+        nwins = (scalar_t::bit_length() - 1) / wbits + 1;
+
+        uint32_t row_sz = 1U << (wbits - 1);
 
-        uint32_t lg_npoints = lg2(npoints + npoints / 2);
-        size_t batch = 1 << (std::max(lg_npoints, wbits) - wbits);
+        size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
+        d_buckets_sz *= sizeof(d_buckets[0]);
+        size_t d_hist_sz = nwins * row_sz * sizeof(uint32_t);
+        // batch and stride are stored as members; invoke() no longer declares its own copies
+        this->batch = 1 << (std::max(lg_n, wbits) - wbits);
         batch >>= 6;
         batch = batch ? batch : 1;
-        uint32_t stride = (npoints + batch - 1) / batch;
+        this->stride = (npoints + batch - 1) / batch;
         stride = (stride + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
 
+        size_t temp_sz = stride * std::max(2 * sizeof(uint2), sizeof(scalar_t));
+        size_t digits_sz = nwins * stride * sizeof(uint32_t);
+        // size_t pidx_sz = pidx ? stride * sizeof(uint32_t) : 0;
+
+        size_t d_blob_sz = d_buckets_sz + d_hist_sz + temp_sz + digits_sz; // + pidx_sz;
+        if (d_total_blob) { gpu.sync(); gpu.Dfree(d_total_blob); d_total_blob = nullptr; } // repeat setup: release the previous blob first
+        d_total_blob = reinterpret_cast<char *>(gpu.Dmalloc(d_blob_sz));
+        size_t offset = 0;
+        d_buckets = reinterpret_cast<decltype(d_buckets)>(&d_total_blob[offset]);
+        offset += d_buckets_sz;
+        d_hist = vec2d_t<uint32_t>((uint32_t *)&d_total_blob[offset], row_sz);
+        offset += d_hist_sz;
+        // d_temps and d_scalars start at the same offset; temp_sz is sized for the larger of the two
+        d_temps = vec2d_t<uint2>((uint2 *)&d_total_blob[offset], stride);
+        d_scalars = (scalar_t *)&d_total_blob[offset];
+        offset += temp_sz;
+        d_digits = vec2d_t<uint32_t>((uint32_t *)&d_total_blob[offset], stride);
+        offset += digits_sz;
+        // if (pidx)
+        //     d_pidx = (uint32_t *)&d_total_blob[offset];
+    }
+
+    RustError invoke(point_t &out, const affine_t *points, size_t npoints,
+                     const scalar_t *scalars, bool mont = true,
+                     size_t ffi_affine_sz = sizeof(affine_t))
+    {
+        assert(this->npoints == 0 || npoints <= this->npoints);
+
         std::vector<result_t> res(nwins);
         std::vector<bucket_t> ones(gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
 
@@ -520,29 +570,6 @@ public:
 
         try
         {
-            // |scalars| being nullptr means the scalars are pre-loaded to
-            // |d_scalars|, otherwise allocate stride.
-            size_t temp_sz = scalars ? sizeof(scalar_t) : 0;
-            temp_sz = stride * std::max(2 * sizeof(uint2), temp_sz);
-
-            // |points| being nullptr means the points are pre-loaded to
-            // |d_points|, otherwise allocate double-stride.
-            const char *points = reinterpret_cast<const char *>(points_);
-            size_t d_point_sz = points ? (batch > 1 ? 2 * stride : stride) : 0;
-            d_point_sz *= sizeof(affine_h);
-
-            size_t digits_sz = nwins * stride * sizeof(uint32_t);
-
-            dev_ptr_t<uint8_t> d_temp{temp_sz + digits_sz + d_point_sz, gpu[2]};
-
-            vec2d_t<uint2> d_temps{&d_temp[0], stride};
-            vec2d_t<uint32_t> d_digits{&d_temp[temp_sz], stride};
-
-            scalar_t *d_scalars = scalars ? (scalar_t *)&d_temp[0]
-                                          : this->d_scalars;
-            affine_h *d_points = points ? (affine_h *)&d_temp[temp_sz + digits_sz]
-                                        : this->d_points;
-
             size_t d_off = 0; // device offset
             size_t h_off = 0; // host offset
             size_t num = stride > npoints ? npoints : stride;
@@ -823,8 +850,8 @@ static RustError mult_pippenger(point_t *out, const affine_t points[], size_t np
 {
     try
     {
-        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{nullptr, npoints, true};
-        return msm.invoke(*out, slice_t<affine_t>{points, npoints},
+        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{points, npoints, true};
+        return msm.invoke(*out, nullptr, npoints,
                           scalars, mont, ffi_affine_sz);
     }
     catch (const cuda_error &e)
@@ -847,8 +874,7 @@ static RustError mult_pippenger_with(point_t *out, msm_context_t<affine_h> *msm_
 {
     try
     {
-        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{nullptr, npoints, false};
-        msm.set_d_points(msm_context->d_points);
+        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{msm_context->d_points, npoints};
         return msm.invoke(*out, nullptr, npoints,
                           scalars, mont, ffi_affine_sz);
     }