From 956ee4dc3bb7c78ca5df1a2b0ec0cf8d545689cc Mon Sep 17 00:00:00 2001 From: Hanting Zhang Date: Fri, 9 Feb 2024 19:06:41 +0000 Subject: [PATCH] wip debug --- msm/pippenger.cuh | 176 ++++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 75 deletions(-) diff --git a/msm/pippenger.cuh b/msm/pippenger.cuh index 104da21..f1cb270 100644 --- a/msm/pippenger.cuh +++ b/msm/pippenger.cuh @@ -357,13 +357,26 @@ template d_hist; - bool owned; + vec2d_t d_temps; + vec2d_t d_digits; template using vec_t = slice_t; @@ -387,60 +400,49 @@ class msm_t } public: - msm_t(const affine_t points[], size_t np, bool owned, - size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1) - : owned(owned), gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr) + // Initialize the MSM by moving the points to the device + msm_t(const affine_t points[], size_t npoints, bool owned, int device_id = -1) : gpu(select_gpu(device_id)) { - npoints = (np + WARP_SZ - 1) & ((size_t)0 - WARP_SZ); - - wbits = 17; - if (npoints > 192) - { - wbits = std::min(lg2(npoints + npoints / 2) - 8, 18); - if (wbits < 10) - wbits = 10; - } - else if (npoints > 0) - { - wbits = 10; - } - nwins = (scalar_t::bit_length() - 1) / wbits + 1; + // set default values for fields + this->d_points = nullptr; + this->d_scalars = nullptr; + this->d_pidx = nullptr; + this->npoints = npoints; + this->owned = owned; - uint32_t row_sz = 1U << (wbits - 1); + this->d_total_blob = nullptr; - size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ); - size_t d_blob_sz = (d_buckets_sz * sizeof(d_buckets[0])) + (nwins * row_sz * sizeof(uint32_t)); + d_points = reinterpret_cast(gpu.Dmalloc(npoints * sizeof(d_points[0]))); + gpu.HtoD(d_points, points, npoints, sizeof(affine_h)); + CUDA_OK(cudaGetLastError()); + } - d_buckets = reinterpret_cast(gpu.Dmalloc(d_blob_sz)); - d_hist = vec2d_t(&d_buckets[d_buckets_sz], row_sz); - if (points) - { - d_points = reinterpret_cast(gpu.Dmalloc(points ? npoints * sizeof(d_points[0]) : 0)); - gpu.HtoD(d_points, points, np, ffi_affine_sz); - } + msm_t(affine_h *d_points, size_t npoints, int device_id = -1) : gpu(select_gpu(device_id)) + { + // set default values for fields + this->d_points = d_points; + this->d_scalars = nullptr; + this->d_pidx = nullptr; + this->npoints = npoints; + this->owned = false; - if (owned) - npoints = 0; - else - npoints = np; + this->d_total_blob = nullptr; } - inline msm_t(vec_t points, size_t ffi_affine_sz = sizeof(affine_t), - int device_id = -1) - : msm_t(points, points.size(), ffi_affine_sz, device_id){}; - inline msm_t(int device_id = -1) - : msm_t(nullptr, 0, 0, device_id){}; + ~msm_t() { gpu.sync(); - if (d_buckets) - gpu.Dfree(d_buckets); + if (d_total_blob) + gpu.Dfree(d_total_blob); if (d_points && owned) gpu.Dfree(d_points); } + affine_h *get_d_points() { return d_points; } + void set_d_points(affine_h *d_points) { assert(!this->owned); @@ -499,19 +501,67 @@ private: } public: - RustError invoke(point_t &out, const affine_t *points_, size_t npoints, - const scalar_t *scalars, bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) + // Compute various constants (stride length, window size) based on the number of scalars. + // Also allocate scratch space. + void setup_scratch(size_t npoints) { - assert(this->npoints == 0 || npoints <= this->npoints); + this->npoints = npoints; + + uint32_t lg_n = lg2(npoints + npoints / 2); + + wbits = 17; + if (npoints > 192) + { + wbits = std::min(lg_n, (uint32_t)18); + if (wbits < 10) + wbits = 10; + } + else if (npoints > 0) + { + wbits = 10; + } + nwins = (scalar_t::bit_length() - 1) / wbits + 1; + + uint32_t row_sz = 1U << (wbits - 1); - uint32_t lg_npoints = lg2(npoints + npoints / 2); - size_t batch = 1 << (std::max(lg_npoints, wbits) - wbits); + size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ); + d_buckets_sz *= sizeof(d_buckets[0]); + size_t d_hist_sz = nwins * row_sz * sizeof(uint32_t); + + this->batch = 1 << (std::max(lg_n, wbits) - wbits); batch >>= 6; batch = batch ? batch : 1; - uint32_t stride = (npoints + batch - 1) / batch; + this->stride = (npoints + batch - 1) / batch; stride = (stride + WARP_SZ - 1) & ((size_t)0 - WARP_SZ); + size_t temp_sz = stride * std::max(2 * sizeof(uint2), sizeof(scalar_t)); + size_t digits_sz = nwins * stride * sizeof(uint32_t); + // size_t pidx_sz = pidx ? stride * sizeof(uint32_t) : 0; + + size_t d_blob_sz = d_buckets_sz + d_hist_sz + temp_sz + digits_sz; // + pidx_sz; + + d_total_blob = reinterpret_cast(gpu.Dmalloc(d_blob_sz)); + size_t offset = 0; + d_buckets = reinterpret_cast(&d_total_blob[offset]); + offset += d_buckets_sz; + d_hist = vec2d_t((uint32_t *)&d_total_blob[offset], row_sz); + offset += d_hist_sz; + + d_temps = vec2d_t((uint2 *)&d_total_blob[offset], stride); + d_scalars = (scalar_t *)&d_total_blob[offset]; + offset += temp_sz; + d_digits = vec2d_t((uint32_t *)&d_total_blob[offset], stride); + offset += digits_sz; + // if (pidx) + // d_pidx = (uint32_t *)&d_total_blob[offset]; + } + + RustError invoke(point_t &out, const affine_t *points, size_t npoints, + const scalar_t *scalars, bool mont = true, + size_t ffi_affine_sz = sizeof(affine_t)) + { + assert(this->npoints == 0 || npoints <= this->npoints); + std::vector res(nwins); std::vector ones(gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ); @@ -520,29 +570,6 @@ public: try { - // |scalars| being nullptr means the scalars are pre-loaded to - // |d_scalars|, otherwise allocate stride. - size_t temp_sz = scalars ? sizeof(scalar_t) : 0; - temp_sz = stride * std::max(2 * sizeof(uint2), temp_sz); - - // |points| being nullptr means the points are pre-loaded to - // |d_points|, otherwise allocate double-stride. - const char *points = reinterpret_cast(points_); - size_t d_point_sz = points ? (batch > 1 ? 2 * stride : stride) : 0; - d_point_sz *= sizeof(affine_h); - - size_t digits_sz = nwins * stride * sizeof(uint32_t); - - dev_ptr_t d_temp{temp_sz + digits_sz + d_point_sz, gpu[2]}; - - vec2d_t d_temps{&d_temp[0], stride}; - vec2d_t d_digits{&d_temp[temp_sz], stride}; - - scalar_t *d_scalars = scalars ? (scalar_t *)&d_temp[0] - : this->d_scalars; - affine_h *d_points = points ? (affine_h *)&d_temp[temp_sz + digits_sz] - : this->d_points; - size_t d_off = 0; // device offset size_t h_off = 0; // host offset size_t num = stride > npoints ? npoints : stride; @@ -823,8 +850,8 @@ static RustError mult_pippenger(point_t *out, const affine_t points[], size_t np { try { - msm_t msm{nullptr, npoints, true}; - return msm.invoke(*out, slice_t{points, npoints}, + msm_t msm{points, npoints, true}; + return msm.invoke(*out, nullptr, npoints, scalars, mont, ffi_affine_sz); } catch (const cuda_error &e) @@ -847,8 +874,7 @@ static RustError mult_pippenger_with(point_t *out, msm_context_t *msm_ { try { - msm_t msm{nullptr, npoints, false}; - msm.set_d_points(msm_context->d_points); + msm_t msm{msm_context->d_points, npoints}; return msm.invoke(*out, nullptr, npoints, scalars, mont, ffi_affine_sz); }