diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..ed5e6cc
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,3 @@
+BasedOnStyle: LLVM
+IndentWidth: 4
+ColumnLimit: 120
diff --git a/msm/pippenger.cuh b/msm/pippenger.cuh
index 9b6aaed..fdbd6fa 100644
--- a/msm/pippenger.cuh
+++ b/msm/pippenger.cuh
@@ -5,15 +5,15 @@
 #ifndef __SPPARK_MSM_PIPPENGER_CUH__
 #define __SPPARK_MSM_PIPPENGER_CUH__

-#include
-#include
 #include
+#include
+#include

-#include
 #include
+#include

-#include "sort.cuh"
 #include "batch_addition.cuh"
+#include "sort.cuh"

 #ifndef WARP_SZ
 #define WARP_SZ 32
@@ -30,19 +30,13 @@
 #ifdef __CUDA_ARCH__

 // Transposed scalar_t
-template
-class scalar_T
-{
+template class scalar_T {
     uint32_t val[sizeof(scalar_t) / sizeof(uint32_t)][WARP_SZ];

-public:
+  public:
     __device__ const uint32_t &operator[](size_t i) const { return val[i][0]; }
-    __device__ scalar_T &operator()(uint32_t laneid)
-    {
-        return *reinterpret_cast(&val[0][laneid]);
-    }
-    __device__ scalar_T &operator=(const scalar_t &rhs)
-    {
+    __device__ scalar_T &operator()(uint32_t laneid) { return *reinterpret_cast(&val[0][laneid]); }
+    __device__ scalar_T &operator=(const scalar_t &rhs) {
         for (size_t i = 0; i < sizeof(scalar_t) / sizeof(uint32_t); i++)
             val[i][0] = rhs[i];
         return *this;
@@ -51,8 +45,7 @@ public:

 template
 __device__ __forceinline__ static uint32_t get_wval(const scalar_T &scalar, uint32_t off,
-                                                    uint32_t top_i = (scalar_t::nbits + 31) / 32 - 1)
-{
+                                                    uint32_t top_i = (scalar_t::nbits + 31) / 32 - 1) {
     uint32_t i = off / 32;
     uint64_t ret = scalar[i];

@@ -62,8 +55,7 @@ __device__ __forceinline__ static uint32_t get_wval(const scalar_T &sc
     return ret >> (off % 32);
 }

-__device__ __forceinline__ static uint32_t booth_encode(uint32_t wval, uint32_t wmask, uint32_t wbits)
-{
+__device__ __forceinline__ static uint32_t booth_encode(uint32_t wval, uint32_t wmask, uint32_t wbits) {
     uint32_t sign = (wval >> wbits) & 1;
     wval = ((wval + 1) & wmask) >> 1;
     return sign ? 0 - wval : wval;
@@ -71,10 +63,8 @@ __device__ __forceinline__ static uint32_t booth_encode(uint32_t wval, uint32_t
 #endif

 template
-__launch_bounds__(1024) __global__
-    void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len,
-                   uint32_t nwins, uint32_t wbits, bool mont = true)
-{
+__launch_bounds__(1024) __global__ void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len,
+                                                  uint32_t nwins, uint32_t wbits, bool mont = true) {
     assert(len <= (1U << 31) && wbits < 32);

 #ifdef __CUDA_ARCH__
@@ -88,8 +78,7 @@ __launch_bounds__(1024) __global__
     auto &scalar = xchange[tid / WARP_SZ](tid % WARP_SZ);

 #pragma unroll 1
-    for (uint32_t i = tix; i < (uint32_t)len; i += gridDim.x * blockDim.x)
-    {
+    for (uint32_t i = tix; i < (uint32_t)len; i += gridDim.x * blockDim.x) {
         auto s = scalars[i];

 #if 0
@@ -108,8 +97,7 @@ __launch_bounds__(1024) __global__
         scalar = s;

 #pragma unroll 1
-        for (uint32_t bit0 = nwins * wbits - 1, win = nwins; --win;)
-        {
+        for (uint32_t bit0 = nwins * wbits - 1, win = nwins; --win;) {
             bit0 -= wbits;
             uint32_t wval = get_wval(scalar, bit0, top_i);
             wval = booth_encode(wval, wmask, wbits);
@@ -149,15 +137,12 @@ __launch_bounds__(1024) __global__
 #error "invalid MSM_NSTREAMS"
 #endif

-template
 __launch_bounds__(ACCUMULATE_NTHREADS) __global__ void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits,
-                /*const*/ affine_h points_[], const vec2d_t digits,
-                const vec2d_t histogram, uint32_t sid = 0)
-{
+                                                                  /*const*/ affine_h points_[], const vec2d_t digits, const vec2d_t histogram,
+                                                                  uint32_t sid = 0) {
     vec2d_t buckets{buckets_, 1U << --wbits};
     const affine_h *points = points_;

@@ -182,8 +167,7 @@ __launch_bounds__(ACCUMULATE_NTHREADS) __global__
     x = __shfl_sync(0xffffffff, x, 0) + lane_id;
 #endif

-    while (x < (nwins << wbits))
-    {
+    while (x < (nwins << wbits)) {
         y = x >> wbits;
         x &= (1U << wbits) - 1;
         const uint32_t *h = &histogram[y][x];
@@ -193,13 +177,14 @@ __launch_bounds__(ACCUMULATE_NTHREADS) __global__
         asm("{ .reg.pred %did;"
             " shfl.sync.up.b32 %0|%did, %1, %2, 0, 0xffffffff;"
             " @!%did mov.b32 %0, 0;"
-            "}" : "=r"(idx) : "r"(len), "r"(degree));
+            "}"
+            : "=r"(idx)
+            : "r"(len), "r"(degree));

         if (lane_id == 0 && x != 0)
             idx = h[-1];

-        if ((len -= idx) && !(x == 0 && y == 0))
-        {
+        if ((len -= idx) && !(x == 0 && y == 0)) {
             const uint32_t *digs_ptr = &digits[y][idx];
             uint32_t digit = *digs_ptr++;

@@ -207,8 +192,7 @@ __launch_bounds__(ACCUMULATE_NTHREADS) __global__
             bucket_t bucket = p;
             bucket.cneg(digit >> 31);

-            while (--len)
-            {
+            while (--len) {
                 digit = *digs_ptr++;
                 p = points[digit & 0x7fffffff];
                 if (sizeof(bucket) <= 128 || LARGE_L1_CODE_CACHE)
@@ -218,9 +202,7 @@ __launch_bounds__(ACCUMULATE_NTHREADS) __global__
             }

             buckets[y][x] = bucket;
-        }
-        else
-        {
+        } else {
             buckets[y][x].inf();
         }

@@ -235,9 +217,7 @@ __launch_bounds__(ACCUMULATE_NTHREADS) __global__
 }

 template
-__launch_bounds__(256) __global__
-    void integrate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbits)
-{
+__launch_bounds__(256) __global__ void integrate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbits) {
     const uint32_t degree = bucket_t::degree;
     uint32_t Nthrbits = 31 - __clz(blockDim.x / degree);

@@ -254,23 +234,19 @@ __launch_bounds__(256) __global__
     row += tid * i;

     uint32_t mask = 0;
-    if ((bid + 1) * wbits > nbits)
-    {
+    if ((bid + 1) * wbits > nbits) {
         uint32_t lsbits = nbits - bid * wbits;
         mask = (1U << (wbits - lsbits)) - 1;
     }

     bucket_t res, acc = row[--i];

-    if (i & mask)
-    {
+    if (i & mask) {
         if (sizeof(res) <= 128)
             res.inf();
         else
            scratch[tid].inf();
-    }
-    else
-    {
+    } else {
         if (sizeof(res) <= 128)
             res = acc;
         else
@@ -280,40 +256,29 @@ __launch_bounds__(256) __global__
     bucket_t p;

 #pragma unroll 1
-    while (i--)
-    {
+    while (i--) {
         p = row[i];
         uint32_t pc = i & mask ? 2 : 0;
 #pragma unroll 1
-        do
-        {
-            if (sizeof(bucket_t) <= 128)
-            {
+        do {
+            if (sizeof(bucket_t) <= 128) {
                 p.add(acc);
-                if (pc == 1)
-                {
+                if (pc == 1) {
                     res = p;
-                }
-                else
-                {
+                } else {
                     acc = p;
                     if (pc == 0)
                         p = res;
                 }
-            }
-            else
-            {
+            } else {
                 if (LARGE_L1_CODE_CACHE && degree == 1)
                     p.add(acc);
                 else
                     p.uadd(acc);
-                if (pc == 1)
-                {
+                if (pc == 1) {
                     scratch[tid] = p;
-                }
-                else
-                {
+                } else {
                     acc = p;
                     if (pc == 0)
                         p = scratch[tid];
@@ -330,129 +295,111 @@ __launch_bounds__(256) __global__
 #undef asm

 #ifndef SPPARK_DONT_INSTANTIATE_TEMPLATES
-template __global__ void accumulate(bucket_t::mem_t buckets_[],
-                                    uint32_t nwins, uint32_t wbits,
+template __global__ void accumulate(bucket_t::mem_t buckets_[], uint32_t nwins,
+                                    uint32_t wbits,
                                     /*const*/ affine_t::mem_t points_[], const vec2d_t digits,
-                                    const vec2d_t histogram,
-                                    uint32_t sid);
-template __global__ void batch_addition(bucket_t::mem_t buckets[],
-                                        const affine_t::mem_t points[], size_t npoints,
-                                        const uint32_t digits[], const uint32_t &ndigits);
-template __global__ void integrate(bucket_t::mem_t buckets_[], uint32_t nwins,
-                                   uint32_t wbits, uint32_t nbits);
-template __global__ void breakdown(vec2d_t digits, const scalar_t scalars[],
-                                   size_t len, uint32_t nwins, uint32_t wbits, bool mont);
+                                    const vec2d_t histogram, uint32_t sid);
+template __global__ void batch_addition(bucket_t::mem_t buckets[], const affine_t::mem_t points[],
+                                        size_t npoints, const uint32_t digits[], const uint32_t &ndigits);
+template __global__ void integrate(bucket_t::mem_t buckets_[], uint32_t nwins, uint32_t wbits,
+                                   uint32_t nbits);
+template __global__ void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len,
+                                   uint32_t nwins, uint32_t wbits, bool mont);
 #endif

 #include
 #include
-#include
 #include
+#include

-template
-class msm_t
-{
+class msm_t {
     const gpu_t &gpu;
-    size_t npoints;
-    uint32_t wbits, nwins;
-    bucket_h *d_buckets;
+
+    // main data
+    bool owned;
     affine_h *d_points;
     scalar_t *d_scalars;
     uint32_t *d_pidx;
+    size_t npoints;
+    size_t nscalars;
+
+    // per setup constants
+    uint32_t wbits, nwins;
+    uint32_t batch;
+    uint32_t stride;
+
+    // auxiliary space
+    char *d_total_blob;
+    bucket_h *d_buckets;
     vec2d_t d_hist;
-    bool owned;
+    vec2d_t d_temps;
+    vec2d_t d_digits;

-    template
-    using vec_t = slice_t;
+    template using vec_t = slice_t;

-    class result_t
-    {
+    class result_t {
         bucket_t ret[MSM_NTHREADS / bucket_t::degree][2];

-    public:
+      public:
         result_t() {}
-        inline operator decltype(ret) & () { return ret; }
+        inline operator decltype(ret) &() { return ret; }
         inline const bucket_t *operator[](size_t i) const { return ret[i]; }
     };

-    constexpr static int lg2(size_t n)
-    {
+    constexpr static int lg2(size_t n) {
         int ret = 0;
         while (n >>= 1)
             ret++;
         return ret;
     }

-public:
-    msm_t(const affine_t points[], size_t np, bool owned,
-          size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1)
-        : owned(owned), gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr), d_pidx(nullptr)
-    {
-        npoints = (np + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
-
-        wbits = 17;
-        if (npoints > 192)
-        {
-            wbits = std::min(lg2(npoints + npoints / 2) - 8, 18);
-            if (wbits < 10)
-                wbits = 10;
-        }
-        else if (npoints > 0)
-        {
-            wbits = 10;
-        }
-        nwins = (scalar_t::bit_length() - 1) / wbits + 1;
-
-        uint32_t row_sz = 1U << (wbits - 1);
-
-        size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
-        size_t d_blob_sz = (d_buckets_sz * sizeof(d_buckets[0])) + (nwins * row_sz * sizeof(uint32_t));
-
-        d_buckets = reinterpret_cast(gpu.Dmalloc(d_blob_sz));
-        d_hist = vec2d_t(&d_buckets[d_buckets_sz], row_sz);
-        if (points)
-        {
-            d_points = reinterpret_cast(gpu.Dmalloc(points ? npoints * sizeof(d_points[0]) : 0));
-            gpu.HtoD(d_points, points, np, ffi_affine_sz);
-        }
+  public:
+    // Initialize the MSM by moving the points to the device
+    msm_t(const affine_t points[], size_t npoints, int device_id = -1)
+        : gpu(select_gpu(device_id)) {
+        // set default values for fields
+        this->d_points = nullptr;
+        this->d_scalars = nullptr;
+        this->d_pidx = nullptr;
+        this->npoints = npoints;
+        this->owned = true;
+
+        d_points = reinterpret_cast(gpu.Dmalloc(npoints * sizeof(d_points[0])));
+        gpu.HtoD(d_points, points, npoints, sizeof(affine_t));
+        CUDA_OK(cudaGetLastError());
+    }

-        if (owned)
-            npoints = 0;
-        else
-            npoints = np;
+    msm_t(affine_h *d_points, size_t npoints, int device_id = -1)
+        : gpu(select_gpu(device_id)) {
+        // set default values for fields
+        this->d_points = d_points;
+        this->d_scalars = nullptr;
+        this->d_pidx = nullptr;
+        this->npoints = npoints;
+        this->owned = false;
     }
-    inline msm_t(vec_t points, size_t ffi_affine_sz = sizeof(affine_t),
-                 int device_id = -1)
-        : msm_t(points, points.size(), ffi_affine_sz, device_id){};
-    inline msm_t(int device_id = -1)
-        : msm_t(nullptr, 0, 0, device_id){};
-    ~msm_t()
-    {
+
+    ~msm_t() {
         gpu.sync();
-        if (d_buckets)
-            gpu.Dfree(d_buckets);
+        if (d_total_blob)
+            gpu.Dfree(d_total_blob);
         if (d_points && owned)
             gpu.Dfree(d_points);
     }

-    affine_h *get_d_points()
-    {
-        return d_points;
-    }
-    void set_d_points(affine_h *d_points)
-    {
+
+    affine_h *get_d_points() { return d_points; }
+    void set_d_points(affine_h *d_points) {
         assert(!this->owned);
         this->d_points = d_points;
     }

-private:
-    void digits(const scalar_t d_scalars[], size_t len,
-                vec2d_t &d_digits, vec2d_t &d_temps,
-                bool mont, uint32_t *d_pidx)
-    {
+  private:
+    void digits(const scalar_t d_scalars[], size_t len, vec2d_t &d_digits, vec2d_t &d_temps, bool mont,
+                uint32_t *d_pidx) {
         // Using larger grid size doesn't make 'sort' run faster, actually
         // quite contrary. Arguably because global memory bus gets
         // thrashed... Stepping far outside the sweet spot has significant
@@ -464,8 +411,8 @@ private:
         while (grid_size & (grid_size - 1))
             grid_size -= (grid_size & (0 - grid_size));

-        breakdown<<<2 * grid_size, 1024, sizeof(scalar_t) * 1024, gpu[2]>>>(
-            d_digits, d_scalars, len, nwins, wbits, mont);
+        breakdown<<<2 * grid_size, 1024, sizeof(scalar_t) * 1024, gpu[2]>>>(d_digits, d_scalars, len, nwins, wbits,
+                                                                            mont);
         CUDA_OK(cudaGetLastError());

         const size_t shared_sz = sizeof(uint32_t) << DIGIT_BITS;
@@ -485,35 +432,73 @@ private:
         // ~50% slower but sort twice as much data...
         uint32_t top = scalar_t::bit_length() - wbits * (nwins - 1);
         uint32_t win;
-        for (win = 0; win < nwins - 1; win += 2)
-        {
-            gpu[2].launch_coop(sort, {{grid_size, 2}, SORT_BLOCKDIM, shared_sz},
-                               d_digits, len, win, d_temps, d_hist,
-                               wbits - 1, wbits - 1, win == nwins - 2 ? top - 1 : wbits - 1,
-                               d_pidx);
+        for (win = 0; win < nwins - 1; win += 2) {
+            gpu[2].launch_coop(sort, {{grid_size, 2}, SORT_BLOCKDIM, shared_sz}, d_digits, len, win, d_temps, d_hist,
+                               wbits - 1, wbits - 1, win == nwins - 2 ? top - 1 : wbits - 1, d_pidx);
         }
-        if (win < nwins)
-        {
-            gpu[2].launch_coop(sort, {{grid_size, 1}, SORT_BLOCKDIM, shared_sz},
-                               d_digits, len, win, d_temps, d_hist,
+        if (win < nwins) {
+            gpu[2].launch_coop(sort, {{grid_size, 1}, SORT_BLOCKDIM, shared_sz}, d_digits, len, win, d_temps, d_hist,
                                wbits - 1, top - 1, 0u, d_pidx);
         }
 #endif
     }

-public:
-    RustError invoke(point_t &out, const affine_t *points, size_t npoints,
-                     const scalar_t *scalars, uint32_t pidx[], bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
-    {
-        assert(this->npoints == 0 || npoints <= this->npoints);
+  public:
+    // Compute various constants (stride length, window size) based on the number of scalars.
+    // Also allocate scratch space.
+    void setup_scalars(size_t nscalars) {
+        // nscalars = (nscalars + WARP_SZ - 1) & ~(WARP_SZ - 1);
+        uint32_t lg_n = lg2(nscalars + nscalars / 2);
+
+        // Compute window size
+        wbits = 17;
+        if (nscalars > 192) {
+            wbits = std::min(lg_n - 8, (uint32_t)18);
+            if (wbits < 10)
+                wbits = 10;
+        } else if (nscalars > 0) {
+            wbits = 10;
+        }
+        nwins = (scalar_t::bit_length() - 1) / wbits + 1;

-        uint32_t lg_npoints = lg2(npoints + npoints / 2);
-        size_t batch = 1 << (std::max(lg_npoints, wbits) - wbits);
+        // Allocate the buckets and histogram
+        uint32_t row_sz = 1U << (wbits - 1);
+        size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
+        d_buckets_sz *= sizeof(bucket_h);
+        size_t d_hist_sz = nwins * row_sz * sizeof(uint32_t);
+
+        // Compute how big each batch should be
+        batch = 1 << (std::max(lg_n, wbits) - wbits);
         batch >>= 6;
         batch = batch ? batch : 1;
-        uint32_t stride = (npoints + batch - 1) / batch;
-        stride = (stride + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
+        stride = (npoints + batch - 1) / batch;
+        stride = (stride + WARP_SZ - 1) & ~(WARP_SZ - 1);
+
+        // Allocate the memory required for each batch
+        size_t scalars_sz = stride * std::max(2 * sizeof(uint2), sizeof(scalar_t));
+        size_t pidx_sz = sizeof(uint32_t) * stride;
+        size_t digits_sz = nwins * stride * sizeof(uint32_t);
+
+        size_t d_blob_sz = d_buckets_sz + d_hist_sz + scalars_sz + pidx_sz + digits_sz;
+        d_total_blob = reinterpret_cast(gpu.Dmalloc(d_blob_sz));
+        size_t offset = 0;
+
+        d_buckets = reinterpret_cast(d_total_blob);
+        offset += d_buckets_sz;
+        d_hist = vec2d_t(reinterpret_cast(&d_total_blob[offset]), row_sz);
+        offset += d_hist_sz;
+
+        d_temps = vec2d_t(reinterpret_cast(&d_total_blob[offset]), stride);
+        d_scalars = reinterpret_cast(&d_total_blob[offset]);
+        offset += scalars_sz;
+        d_pidx = reinterpret_cast(&d_total_blob[offset]);
+        offset += pidx_sz;
+        d_digits = vec2d_t(reinterpret_cast(&d_total_blob[offset]), stride);
+        offset += digits_sz;
+    }
+
+    RustError invoke(point_t &out, const scalar_t *scalars, size_t nscalars, uint32_t pidx[], bool mont = true) {
+        setup_scalars(nscalars);

         std::vector res(nwins);
         std::vector ones(gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
@@ -521,105 +506,48 @@ public:
         out.inf();

         point_t p;

-        try
-        {
-            // |scalars| being nullptr means the scalars are pre-loaded to
-            // |d_scalars|, otherwise allocate stride.
-            size_t scalars_sz = scalars ? sizeof(scalar_t) : 0;
-            scalars_sz = stride * std::max(2 * sizeof(uint2), scalars_sz);
-
-            size_t pidx_sz = pidx ? sizeof(uint32_t) : 0;
-            pidx_sz *= stride;
-
-            // |points| being nullptr means the points are pre-loaded to
-            // |d_points|, otherwise allocate double-stride.
-            size_t points_sz = points ? (batch > 1 ? 2 * stride : stride) : 0;
-            points_sz *= sizeof(affine_h);
-
-            size_t digits_sz = nwins * stride * sizeof(uint32_t);
-
-            dev_ptr_t d_temp{scalars_sz + pidx_sz + digits_sz + points_sz, gpu[2]};
-
-            vec2d_t d_temps{&d_temp[0], stride};
-            vec2d_t d_digits{&d_temp[scalars_sz + pidx_sz], stride};
-
-            scalar_t *d_scalars = scalars ? (scalar_t *)&d_temp[0]
-                                          : this->d_scalars;
-            uint32_t *d_pidx = pidx ? (uint32_t *)&d_temp[scalars_sz]
-                                    : this->d_pidx;
-            affine_h *d_points = points ? (affine_h *)&d_temp[scalars_sz + pidx_sz + digits_sz]
-                                        : this->d_points;
-
+        try {
             size_t d_off = 0; // device offset
             size_t h_off = 0; // host offset
             size_t num = stride > npoints ? npoints : stride;
             event_t ev;

-            if (scalars)
-                gpu[2].HtoD(&d_scalars[d_off], &scalars[h_off], num);
-            if (pidx)
-                gpu[2].HtoD(&d_pidx[d_off], &pidx[h_off], num);
+            gpu[2].HtoD(&d_scalars[0], &scalars[h_off], num);
+            if (pidx) gpu[2].HtoD(&d_pidx[d_off], &pidx[h_off], num);
+
             digits(&d_scalars[0], num, d_digits, d_temps, mont, d_pidx);
             gpu[2].record(ev);

-            if (points) {
-                gpu[0].HtoD(&d_points[d_off], &points[h_off],
-                            num, ffi_affine_sz);
-                CUDA_OK(cudaGetLastError());
-            }
-
-            for (uint32_t i = 0; i < batch; i++)
-            {
+            for (uint32_t i = 0; i < batch; i++) {
                 gpu[i & 1].wait(ev);

-                batch_addition<<>>(
-                    &d_buckets[nwins << (wbits - 1)], &d_points[pidx ? 0 : d_off], num,
-                    &d_digits[0][0], d_hist[0][0]);
+                batch_addition<<>>(
+                    &d_buckets[nwins << (wbits - 1)], &d_points[pidx ? 0 : d_off], num, &d_digits[0][0], d_hist[0][0]);
                 CUDA_OK(cudaGetLastError());

-                gpu[i & 1].launch_coop(accumulate,
-                                       {gpu.sm_count(), 0},
-                                       d_buckets, nwins, wbits, &d_points[pidx ? 0 : d_off], d_digits, d_hist, i & 1);
+                gpu[i & 1].launch_coop(accumulate, {gpu.sm_count(), 0}, d_buckets, nwins, wbits,
+                                       &d_points[pidx ? 0 : d_off], d_digits, d_hist, i & 1);
                 CUDA_OK(cudaGetLastError());
                 gpu[i & 1].record(ev);

-                integrate<<>>(
-                    d_buckets, nwins, wbits, scalar_t::bit_length());
+                integrate
+                    <<>>(
+                        d_buckets, nwins, wbits, scalar_t::bit_length());
                 CUDA_OK(cudaGetLastError());

-                if (i < batch - 1)
-                {
-                    h_off += stride;
-                    num = h_off + stride <= npoints ? stride : npoints - h_off;
+                if (i < batch - 1) {
+                    d_off += stride;
+                    num = d_off + stride <= npoints ? stride : npoints - d_off;
+
+                    gpu[2].HtoD(&d_scalars[0], &scalars[d_off], num);
+                    if (pidx) gpu[2].HtoD(&d_pidx[0], &pidx[d_off], num);

-                    if (scalars)
-                        gpu[2].HtoD(&d_scalars[0], &scalars[h_off], num);
-                    if (pidx)
-                        gpu[2].HtoD(&d_pidx[0], &pidx[h_off], num);
                     gpu[2].wait(ev);
-                    digits(&d_scalars[scalars ? 0 : h_off], num,
-                           d_digits, d_temps, mont, d_pidx);
+                    digits(&d_scalars[0], num, d_digits, d_temps, mont, d_pidx);
                     gpu[2].record(ev);
-
-                    if (points)
-                    {
-                        size_t j = (i + 1) & 1;
-                        d_off = j ? stride : 0;
-                        gpu[j].HtoD(&d_points[d_off], &points[h_off],
-                                    num, ffi_affine_sz);
-                        CUDA_OK(cudaGetLastError());
-                    }
-                    else
-                    {
-                        d_off = h_off;
-                    }
                 }

-                if (i > 0)
-                {
+                if (i > 0) {
                     collect(p, res, ones);
                     out.add(p);
                 }
@@ -628,9 +556,7 @@ public:
                 gpu[i & 1].DtoH(res, d_buckets, sizeof(bucket_h) << (wbits - 1));
                 gpu[i & 1].sync();
             }
-        }
-        catch (const cuda_error &e)
-        {
+        } catch (const cuda_error &e) {
             gpu.sync();
 #ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
             return RustError{e.code(), e.what()};
@@ -645,53 +571,15 @@ public:
         return RustError{cudaSuccess};
     }

-    RustError invoke(point_t &out, const affine_t *points, size_t npoints,
-                     gpu_ptr_t scalars, bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
-    {
-        d_scalars = scalars;
-        return invoke(out, points, npoints, nullptr, mont, ffi_affine_sz);
-    }
-
-    RustError invoke(point_t &out, vec_t scalars, uint32_t pidx[], bool mont = true)
-    {
-        return invoke(out, nullptr, scalars.size(), scalars, pidx, mont);
-    }
-
-    RustError invoke(point_t &out, vec_t points,
-                     const scalar_t *scalars, uint32_t pidx[], bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
-    {
-        return invoke(out, points, points.size(), scalars, pidx, mont, ffi_affine_sz);
-    }
-
-    RustError invoke(point_t &out, vec_t points,
-                     vec_t scalars, uint32_t pidx[], bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
-    {
-        return invoke(out, points, points.size(), scalars, pidx, mont, ffi_affine_sz);
-    }
-
-    RustError invoke(point_t &out, const std::vector &points,
-                     const std::vector &scalars, uint32_t pidx[],
-                     bool mont = true, size_t ffi_affine_sz = sizeof(affine_t))
-    {
-        return invoke(out, points.data(),
-                      std::min(points.size(), scalars.size()),
-                      scalars.data(), nullptr, mont, ffi_affine_sz);
-    }
-
-private:
-    point_t integrate_row(const result_t &row, uint32_t lsbits)
-    {
+  private:
+    point_t integrate_row(const result_t &row, uint32_t lsbits) {
         const int NTHRBITS = lg2(MSM_NTHREADS / bucket_t::degree);

         assert(wbits - 1 > NTHRBITS);

         size_t i = MSM_NTHREADS / bucket_t::degree - 1;

-        if (lsbits - 1 <= NTHRBITS)
-        {
+        if (lsbits - 1 <= NTHRBITS) {
             size_t mask = (1U << (NTHRBITS - (lsbits - 1))) - 1;
             bucket_t res, acc = row[i][1];
@@ -700,8 +588,7 @@ private:
             else
                 res = acc;

-            while (i--)
-            {
+            while (i--) {
                 acc.add(row[i][1]);
                 if ((i & mask) == 0)
                     res.add(acc);
@@ -713,8 +600,7 @@ private:
         point_t res = row[i][0];
         bucket_t acc = row[i][1];

-        while (i--)
-        {
+        while (i--) {
             point_t raise = acc;
             for (size_t j = 0; j < lsbits - 1 - NTHRBITS; j++)
                 raise.dbl();
@@ -727,11 +613,8 @@ private:
         return res;
     }

-    void collect(point_t &out, const std::vector &res,
-                 const std::vector &ones)
-    {
-        struct tile_t
-        {
+    void collect(point_t &out, const std::vector &res, const std::vector &ones) {
+        struct tile_t {
             uint32_t x, y, dy;
             point_t p;
             tile_t() {}
@@ -745,8 +628,7 @@ private:
         grid[0].dy = scalar_t::bit_length() - y * wbits;
         total++;

-        while (y--)
-        {
+        while (y--) {
             grid[total].x = grid[0].x;
             grid[total].y = y;
             grid[total].dy = wbits;
@@ -758,29 +640,26 @@ private:
         channel_t ch;

         auto n_workers = min((uint32_t)gpu.ncpus(), total);
-        while (n_workers--)
-        {
-            gpu.spawn([&, this, total, counter]()
-                      {
+        while (n_workers--) {
+            gpu.spawn([&, this, total, counter]() {
                 for (size_t work; (work = counter++) < total;) {
                     auto item = &grid[work];
                     auto y = item->y;
                     item->p = integrate_row(res[y], item->dy);
                     if (++row_sync[y] == 1)
                         ch.send(y);
-                } });
+                }
+            });
         }

         point_t one = sum_up(ones);

         out.inf();
         size_t row = 0, ny = nwins;
-        while (ny--)
-        {
+        while (ny--) {
             auto y = ch.recv();
             row_sync[y] = -1U;

-            while (grid[row].y == y)
-            {
+            while (grid[row].y == y) {
                 while (row < total && grid[row].y == y)
                     out.add(grid[row++].p);
                 if (y == 0)
@@ -795,34 +674,22 @@ private:
     }
 };

-template
-struct msm_context_t
-{
+template struct msm_context_t {
     T *d_points;
     size_t npoints;
 };

-template
-void drop_msm_context_t(msm_context_t &ref)
-{
-    CUDA_OK(cudaFree(ref.d_points));
-}
+template void drop_msm_context_t(msm_context_t &ref) { CUDA_OK(cudaFree(ref.d_points)); }

-template
-static RustError mult_pippenger_init(const affine_t points[], size_t npoints,
-                                     msm_context_t *msm_context)
-{
-    try
-    {
+static RustError mult_pippenger_init(const affine_t points[], size_t npoints, msm_context_t *msm_context) {
+    try {
         msm_t msm{points, npoints, false};
         msm_context->d_points = msm.get_d_points();
         msm_context->npoints = npoints;
         return RustError{cudaSuccess};
-    }
-    catch (const cuda_error &e)
-    {
+    } catch (const cuda_error &e) {
 #ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
         return RustError{e.code(), e.what()};
 #else
@@ -832,18 +699,12 @@ static RustError mult_pippenger_init(const affine_t points[], size_t npoints,
 }

 template
-static RustError mult_pippenger(point_t *out, const affine_t points[], size_t npoints,
-                                const scalar_t scalars[], bool mont = true,
-                                size_t ffi_affine_sz = sizeof(affine_t))
-{
-    try
-    {
-        msm_t msm{nullptr, npoints, true};
-        return msm.invoke(*out, slice_t{points, npoints},
-                          scalars, nullptr, mont, ffi_affine_sz);
-    }
-    catch (const cuda_error &e)
-    {
+static RustError mult_pippenger(point_t *out, const affine_t points[], size_t npoints, const scalar_t scalars[],
+                                size_t nscalars, bool mont = true) {
+    try {
+        msm_t msm{points, npoints, true};
+        return msm.invoke(*out, scalars, nscalars, nullptr, mont);
+    } catch (const cuda_error &e) {
         out->inf();
 #ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
         return RustError{e.code(), e.what()};
@@ -853,22 +714,14 @@ static RustError mult_pippenger(point_t *out, const affine_t points[], size_t np
     }
 }

-template
-static RustError mult_pippenger_with(point_t *out, msm_context_t *msm_context, size_t npoints,
-                                     const scalar_t scalars[], uint32_t pidx[], bool mont = true,
-                                     size_t ffi_affine_sz = sizeof(affine_t))
-{
-    try
-    {
-        msm_t msm{nullptr, npoints, false};
-        msm.set_d_points(msm_context->d_points);
-        return msm.invoke(*out, nullptr, npoints, scalars, pidx,
-                          mont, ffi_affine_sz);
-    }
-    catch (const cuda_error &e)
-    {
+static RustError mult_pippenger_with(point_t *out, msm_context_t *msm_context,
+                                     const scalar_t scalars[], size_t nscalars, uint32_t pidx[], bool mont = true) {
+    try {
+        msm_t msm{msm_context->d_points, msm_context->npoints};
+        return msm.invoke(*out, scalars, nscalars, pidx, mont);
+    } catch (const cuda_error &e) {
         out->inf();
 #ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
         return RustError{e.code(), e.what()};
diff --git a/msm/pippenger.hpp b/msm/pippenger.hpp
index c71366c..421111f 100644
--- a/msm/pippenger.hpp
+++ b/msm/pippenger.hpp
@@ -5,41 +5,38 @@
 #ifndef __SPPARK_MSM_PIPPENGER_HPP__
 #define __SPPARK_MSM_PIPPENGER_HPP__

-#include
 #include
 #include
+#include

 /* Works up to 25 bits. */
-static size_t get_wval(const unsigned char *d, size_t off, size_t bits)
-{
-    size_t i, top = (off + bits - 1)/8;
+static size_t get_wval(const unsigned char *d, size_t off, size_t bits) {
+    size_t i, top = (off + bits - 1) / 8;
     size_t ret, mask = (size_t)0 - 1;

-    d += off/8;
-    top -= off/8-1;
+    d += off / 8;
+    top -= off / 8 - 1;

     /* this is not about constant-time-ness, but branch optimization */
-    for (ret=0, i=0; i<4;) {
-        ret |= (*d & mask) << (8*i);
-        mask = (size_t)0 - ((++i - top) >> (8*sizeof(top)-1));
+    for (ret = 0, i = 0; i < 4;) {
+        ret |= (*d & mask) << (8 * i);
+        mask = (size_t)0 - ((++i - top) >> (8 * sizeof(top) - 1));
         d += 1 & mask;
     }

-    return ret >> (off%8);
+    return ret >> (off % 8);
 }

-static inline size_t window_size(size_t npoints)
-{
+static inline size_t window_size(size_t npoints) {
     size_t wbits;

-    for (wbits=0; npoints>>=1; wbits++) ;
+    for (wbits = 0; npoints >>= 1; wbits++)
+        ;

-    return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1));
+    return wbits > 12 ? wbits - 3 : (wbits > 4 ? wbits - 2 : (wbits ? 2 : 1));
 }

-template
-static void integrate_buckets(point_t& out, bucket_t buckets[], size_t wbits)
-{
+template static void integrate_buckets(point_t &out, bucket_t buckets[], size_t wbits) {
     bucket_t ret, acc;
     size_t n = (size_t)1 << wbits;

@@ -55,18 +52,14 @@ static void integrate_buckets(point_t& out, bucket_t buckets[], size_t wbits)
     out = ret;
 }

-template
-static void bucket(bucket_t buckets[], size_t booth_idx,
-                   size_t wbits, const affine_t& p)
-{
-    booth_idx &= (1<
+static void bucket(bucket_t buckets[], size_t booth_idx, size_t wbits, const affine_t &p) {
+    booth_idx &= (1 << wbits) - 1;
     if (booth_idx--)
         buckets[booth_idx].add(p);
 }

-template
-static void prefetch(const bucket_t buckets[], size_t booth_idx, size_t wbits)
-{
+template static void prefetch(const bucket_t buckets[], size_t booth_idx, size_t wbits) {
 #if 0
     booth_idx &= (1<

-static void tile(point_t& ret, const affine_t points[], size_t npoints,
-                 const unsigned char* scalars, size_t nbits,
-                 bucket_t buckets[], size_t bit0, size_t wbits, size_t cbits)
-{
+template
+static void tile(point_t &ret, const affine_t points[], size_t npoints, const unsigned char *scalars, size_t nbits,
+                 bucket_t buckets[], size_t bit0, size_t wbits, size_t cbits) {
     size_t wmask, wval, wnxt;
     size_t i, nbytes;

-    nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */
+    nbytes = (nbits + 7) / 8; /* convert |nbits| to bytes */
     wmask = ((size_t)1 << wbits) - 1;
     wval = get_wval(scalars, bit0, wbits) & wmask;
     scalars += nbytes;
     wnxt = get_wval(scalars, bit0, wbits) & wmask;
-    npoints--; /* account for prefetch */
+    npoints--;                /* account for prefetch */

     bucket(buckets, wval, cbits, points[0]);
     for (i = 1; i < npoints; i++) {
@@ -105,61 +96,67 @@ static void tile(point_t& ret, const affine_t points[], size_t npoints,
     integrate_buckets(ret, buckets, cbits);
 }

-template
-static size_t num_bits(T l)
-{
-    const size_t T_BITS = 8*sizeof(T);
-# define MSB(x) ((T)(x) >> (T_BITS-1))
+template static size_t num_bits(T l) {
+    const size_t T_BITS = 8 * sizeof(T);
+#define MSB(x) ((T)(x) >> (T_BITS - 1))
     T x, mask;

-    if ((T)-1 < 0) {    // handle signed T
+    if ((T)-1 < 0) { // handle signed T
         mask = MSB(l);
         l ^= mask;
         l += 1 & mask;
     }

-    size_t bits = (((T)(~l & (l-1)) >> (T_BITS-1)) & 1) ^ 1;
+    size_t bits = (((T)(~l & (l - 1)) >> (T_BITS - 1)) & 1) ^ 1;

     if (sizeof(T) > 4) {
-        x = l >> (32 & (T_BITS-1));
-        mask = MSB(0 - x); if ((T)-1 > 0) mask = 0 - mask;
+        x = l >> (32 & (T_BITS - 1));
+        mask = MSB(0 - x);
+        if ((T)-1 > 0)
+            mask = 0 - mask;
         bits += 32 & mask;
         l ^= (x ^ l) & mask;
     }

     if (sizeof(T) > 2) {
         x = l >> 16;
-        mask = MSB(0 - x); if ((T)-1 > 0) mask = 0 - mask;
+        mask = MSB(0 - x);
+        if ((T)-1 > 0)
+            mask = 0 - mask;
         bits += 16 & mask;
         l ^= (x ^ l) & mask;
     }

     if (sizeof(T) > 1) {
         x = l >> 8;
-        mask = MSB(0 - x); if ((T)-1 > 0) mask = 0 - mask;
+        mask = MSB(0 - x);
+        if ((T)-1 > 0)
+            mask = 0 - mask;
         bits += 8 & mask;
         l ^= (x ^ l) & mask;
     }

     x = l >> 4;
-    mask = MSB(0 - x); if ((T)-1 > 0) mask = 0 - mask;
+    mask = MSB(0 - x);
+    if ((T)-1 > 0)
+        mask = 0 - mask;
     bits += 4 & mask;
     l ^= (x ^ l) & mask;

     x = l >> 2;
-    mask = MSB(0 - x); if ((T)-1 > 0) mask = 0 - mask;
+    mask = MSB(0 - x);
+    if ((T)-1 > 0)
+        mask = 0 - mask;
     bits += 2 & mask;
     l ^= (x ^ l) & mask;

     bits += l >> 1;

     return bits;
-# undef MSB
+#undef MSB
 }

-std::tuple
-static breakdown(size_t nbits, size_t window, size_t ncpus)
-{
+std::tuple static breakdown(size_t nbits, size_t window, size_t ncpus) {
     size_t nx, ny, wnd;

     if (nbits > window * ncpus) {
@@ -168,7 +165,7 @@ static breakdown(size_t nbits, size_t window, size_t ncpus)
         wnd = window - wnd;
     } else {
         wnd = (nbits / window + ncpus - 1) / ncpus;
-        if ((nbits / (window+1) + ncpus - 1) / ncpus < wnd)
+        if ((nbits / (window + 1) + ncpus - 1) / ncpus < wnd)
             wnd = window + 1;
         else
             wnd = window;
@@ -190,19 +187,17 @@ static breakdown(size_t nbits, size_t window, size_t ncpus)
 }

 template
-static void mult(point_t& ret, const affine_t& point,
-                 const pow_t scalar, size_t top)
-{
+static void mult(point_t &ret, const affine_t &point, const pow_t scalar, size_t top) {
     ret.inf();

     if (point.is_inf())
         return;

     struct is_bit {
-        static bool set(const pow_t v, size_t i)
-        { return (v[i/8] >> (i%8)) & 1; }
+        static bool set(const pow_t v, size_t i) { return (v[i / 8] >> (i % 8)) & 1; }
     };

-    while (--top && !is_bit::set(scalar, top)) ;
+    while (--top && !is_bit::set(scalar, top))
+        ;

     if (is_bit::set(scalar, top)) {
         ret = point;
         while (top--) {
@@ -215,19 +210,16 @@ static void mult(point_t& ret, const affine_t& point,

 #include

-template
-static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints,
-                           const scalar_t _scalars[], bool mont,
-                           thread_pool_t* da_pool = nullptr)
-{
+template
+static void mult_pippenger(point_t &ret, const affine_t points[], size_t npoints, const scalar_t _scalars[], bool mont,
+                           thread_pool_t *da_pool = nullptr) {
     typedef typename scalar_t::pow_t pow_t;
     size_t nbits = scalar_t::nbits;
     size_t window = window_size(npoints);
     size_t ncpus = da_pool ? da_pool->size() : 0;

     // below is little-endian dependency, should it be removed?
-    const pow_t* scalars = reinterpret_cast(_scalars);
+    const pow_t *scalars = reinterpret_cast(_scalars);
     std::unique_ptr store = nullptr;
     if (mont) {
         store = decltype(store)(new pow_t[npoints]);
@@ -235,9 +227,7 @@ static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints
             for (size_t i = 0; i < npoints; i++)
                 _scalars[i].to_scalar(store[i]);
         } else {
-            da_pool->par_map(npoints, 512, [&](size_t i) {
-                _scalars[i].to_scalar(store[i]);
-            });
+            da_pool->par_map(npoints, 512, [&](size_t i) { _scalars[i].to_scalar(store[i]); });
         }
         scalars = &store[0];
     }
@@ -255,18 +245,15 @@ static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints
         /* top excess bits modulo target window size */
         size_t wbits = nbits % window, /* yes, it may be zero */
-               cbits = wbits + 1,
-               bit0 = nbits;
+               cbits = wbits + 1, bit0 = nbits;
         while (bit0 -= wbits) {
-            tile(p, points, npoints, scalars[0], nbits,
-                 &buckets[0], bit0, wbits, cbits);
+            tile(p, points, npoints, scalars[0], nbits, &buckets[0], bit0, wbits, cbits);
             ret.add(p);
             for (size_t i = 0; i < window; i++)
                 ret.dbl();
             cbits = wbits = window;
         }
-        tile(p, points, npoints, scalars[0], nbits,
-             &buckets[0], 0, wbits, cbits);
+        tile(p, points, npoints, scalars[0], nbits, &buckets[0], 0, wbits, cbits);
         ret.add(p);
         return;
     }
@@ -281,14 +268,13 @@ static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints
     };
     std::vector grid(nx * ny);

-    size_t dx = npoints / nx,
-           y = window * (ny - 1);
+    size_t dx = npoints / nx, y = window * (ny - 1);
     size_t total = 0;

     while (total < nx) {
-        grid[total].x  = total * dx;
+        grid[total].x = total * dx;
         grid[total].dx = dx;
-        grid[total].y  = y;
+        grid[total].y = y;
         grid[total].dy = nbits - y;
         total++;
     }
@@ -297,9 +283,9 @@ static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints
     while (y) {
         y -= window;
         for (size_t i = 0; i < nx; i++, total++) {
-            grid[total].x  = grid[i].x;
+            grid[total].x = grid[i].x;
             grid[total].dx = grid[i].dx;
-            grid[total].y  = y;
+            grid[total].y = y;
             grid[total].dy = window;
         }
     }
@@ -316,13 +302,8 @@ static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints
         std::vector buckets(1 << window); /* zeroed */
         do {
-            size_t x = grid[work].x,
-                   dx = grid[work].dx,
-                   y = grid[work].y,
-                   dy = grid[work].dy;
-            tile(grid[work].p, &points[x], dx,
-                 scalars[x], nbits, &buckets[0],
-                 y, dy, dy + (dy < window));
+            size_t x = grid[work].x, dx = grid[work].dx, y = grid[work].y, dy = grid[work].dy;
+            tile(grid[work].p, &points[x], dx, scalars[x], nbits, &buckets[0], y, dy, dy + (dy < window));
             if (++row_sync[y / window] == nx)
                 ch.send(y);
         } while ((work = counter++) < total);
@@ -349,27 +330,19 @@ static void mult_pippenger(point_t& ret, const affine_t points[], size_t npoints
     }
 }

-template
-static void mult_pippenger(point_t& ret, const std::vector& points,
-                           const std::vector& scalars, bool mont,
-                           thread_pool_t* da_pool = nullptr)
-{
-    mult_pippenger(ret, points.data(),
-                   std::min(points.size(), scalars.size()),
-                   scalars.data(), mont, da_pool);
+template
+static void mult_pippenger(point_t &ret, const std::vector &points, const std::vector &scalars,
+                           bool mont, thread_pool_t *da_pool = nullptr) {
+    mult_pippenger(ret, points.data(), std::min(points.size(), scalars.size()), scalars.data(), mont,
+                   da_pool);
 }

 #include

-template
-static void mult_pippenger(point_t& ret, slice_t points,
-                           slice_t scalars, bool mont,
-                           thread_pool_t* da_pool = nullptr)
-{
-    mult_pippenger(ret, points.data(),
-                   std::min(points.size(), scalars.size()),
-                   scalars.data(), mont, da_pool);
+template
+static void mult_pippenger(point_t &ret, slice_t points, slice_t scalars, bool mont,
+                           thread_pool_t *da_pool = nullptr) {
+    mult_pippenger(ret, points.data(), std::min(points.size(), scalars.size()), scalars.data(), mont,
+                   da_pool);
 }

 #endif
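Editor's note (not part of the patch): the hunks above replace the old family of invoke() overloads with a two-step host API, where mult_pippenger_init() uploads the point set once and records it in an msm_context_t, and mult_pippenger_with() runs an MSM of nscalars scalars against that cached context. The sketch below is only an illustration of the intended call sequence. The explicit template-argument lists were lost in this rendering of the diff, so the <bucket_t, point_t, affine_t, scalar_t> parameters, the msm_context_t<affine_t::mem_t> element type, and the points/scalars/npoints/nscalars buffers are assumptions taken from the curve backend, not text of the patch.

    // Hedged usage sketch of the refactored host API (assumed template arguments).
    msm_context_t<affine_t::mem_t> ctx{};

    // One-time device upload of the fixed point set.
    RustError err = mult_pippenger_init<bucket_t, point_t, affine_t, scalar_t>(points, npoints, &ctx);

    // Any number of MSMs can now reuse the cached points; nscalars is sized
    // independently of npoints, and pidx may be nullptr when no point-index
    // permutation is supplied.
    point_t result;
    err = mult_pippenger_with<bucket_t, point_t, affine_t, scalar_t>(&result, &ctx, scalars, nscalars,
                                                                     /*pidx=*/nullptr, /*mont=*/true);

    // Release the device-side copy of the points when finished.
    drop_msm_context_t(ctx);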