diff --git a/msm/batch_addition.cuh b/msm/batch_addition.cuh
index c4140dc..714bac8 100644
--- a/msm/batch_addition.cuh
+++ b/msm/batch_addition.cuh
@@ -10,30 +10,29 @@
 #include
 
 #ifndef WARP_SZ
-# define WARP_SZ 32
+#define WARP_SZ 32
 #endif
 
 #define BATCH_ADD_BLOCK_SIZE 256
 
 #ifndef BATCH_ADD_NSTREAMS
-# define BATCH_ADD_NSTREAMS 8
+#define BATCH_ADD_NSTREAMS 8
 #elif BATCH_ADD_NSTREAMS == 0
-# error "invalid BATCH_ADD_NSTREAMS"
+#error "invalid BATCH_ADD_NSTREAMS"
 #endif
 
-template
-__device__ __forceinline__
-static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
-                const uint32_t bitmap[], const uint32_t refmap[],
-                bool accumulate, uint32_t sid)
+template
+__device__ __forceinline__ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
+                                            const uint32_t bitmap[], const uint32_t refmap[],
+                                            bool accumulate, uint32_t sid)
 {
     static __device__ uint32_t streams[BATCH_ADD_NSTREAMS];
-    uint32_t& current = streams[sid % BATCH_ADD_NSTREAMS];
+    uint32_t &current = streams[sid % BATCH_ADD_NSTREAMS];
 
     const uint32_t degree = bucket_t::degree;
     const uint32_t warp_sz = WARP_SZ / degree;
-    const uint32_t tid = (threadIdx.x + blockDim.x*blockIdx.x) / degree;
+    const uint32_t tid = (threadIdx.x + blockDim.x * blockIdx.x) / degree;
     const uint32_t xid = tid % warp_sz;
 
     uint32_t laneid;
@@ -42,31 +41,35 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
     bucket_t acc;
     acc.inf();
 
-    if (accumulate && tid < gridDim.x*blockDim.x/WARP_SZ)
+    if (accumulate && tid < gridDim.x * blockDim.x / WARP_SZ)
         acc = ret[tid];
 
-    uint32_t base = laneid == 0 ? atomicAdd(&current, 32*WARP_SZ) : 0;
+    uint32_t base = laneid == 0 ? atomicAdd(&current, 32 * WARP_SZ) : 0;
     base = __shfl_sync(0xffffffff, base, 0);
-    uint32_t chunk = min(32*WARP_SZ, npoints-base);
+    uint32_t chunk = min(32 * WARP_SZ, npoints - base);
     uint32_t bits, refs, word, sign = 0, off = 0xffffffff;
 
-    for (uint32_t i = 0, j = 0; base < npoints;) {
-        if (i == 0) {
-            bits = bitmap[base/WARP_SZ + laneid];
-            refs = refmap ? refmap[base/WARP_SZ + laneid] : 0;
+    for (uint32_t i = 0, j = 0; base < npoints;)
+    {
+        if (i == 0)
+        {
+            bits = bitmap[base / WARP_SZ + laneid];
+            refs = refmap ? refmap[base / WARP_SZ + laneid] : 0;
             bits ^= refs;
             refs &= bits;
         }
 
-        for (; i < chunk && j < warp_sz; i++) {
-            if (i%32 == 0)
-                word = __shfl_sync(0xffffffff, bits, i/32);
-            if (refmap && (i%32 == 0))
-                sign = __shfl_sync(0xffffffff, refs, i/32);
+        for (; i < chunk && j < warp_sz; i++)
+        {
+            if (i % 32 == 0)
+                word = __shfl_sync(0xffffffff, bits, i / 32);
+            if (refmap && (i % 32 == 0))
+                sign = __shfl_sync(0xffffffff, refs, i / 32);
 
-            if (word & 1) {
+            if (word & 1)
+            {
                 if (j++ == xid)
                     off = (base + i) | (sign << 31);
             }
@@ -74,15 +77,18 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
             sign >>= 1;
         }
 
-        if (i == chunk) {
-            base = laneid == 0 ? atomicAdd(&current, 32*WARP_SZ) : 0;
+        if (i == chunk)
+        {
+            base = laneid == 0 ? atomicAdd(&current, 32 * WARP_SZ) : 0;
             base = __shfl_sync(0xffffffff, base, 0);
-            chunk = min(32*WARP_SZ, npoints-base);
+            chunk = min(32 * WARP_SZ, npoints - base);
             i = 0;
         }
 
-        if (base >= npoints || j == warp_sz) {
-            if (off != 0xffffffff) {
+        if (base >= npoints || j == warp_sz)
+        {
+            if (off != 0xffffffff)
+            {
                 affine_t p = points[off & 0x7fffffff];
                 if (degree == 2)
                     acc.uadd(p, off >> 31);
@@ -95,11 +101,12 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
     }
 
 #ifdef __CUDA_ARCH__
-    for (uint32_t off = 1; off < warp_sz;) {
-        bucket_t down = acc.shfl_down(off*degree);
+    for (uint32_t off = 1; off < warp_sz;)
+    {
+        bucket_t down = acc.shfl_down(off * degree);
 
         off <<= 1;
-        if ((xid & (off-1)) == 0)
+        if ((xid & (off - 1)) == 0)
             acc.uadd(down); // .add() triggers spills ... in .shfl_down()
     }
 #endif
@@ -107,46 +114,51 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
     cooperative_groups::this_grid().sync();
 
     if (xid == 0)
-        ret[tid/warp_sz] = acc;
+        ret[tid / warp_sz] = acc;
 
     if (threadIdx.x + blockIdx.x == 0)
         current = 0;
 }
 
-template
+template
 __launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__
-void batch_addition(bucket_h ret[], const affine_h points[], uint32_t npoints,
-                    const uint32_t bitmap[], bool accumulate = false,
-                    uint32_t sid = 0)
-{ add(ret, points, npoints, bitmap, nullptr, accumulate, sid); }
-
-template
+    void batch_addition(bucket_h ret[], const affine_h points[], uint32_t npoints,
+                        const uint32_t bitmap[], bool accumulate = false,
+                        uint32_t sid = 0)
+{
+    add(ret, points, npoints, bitmap, nullptr, accumulate, sid);
+}
+
+template
 __launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__
-void batch_diff(bucket_h ret[], const affine_h points[], uint32_t npoints,
-                const uint32_t bitmap[], const uint32_t refmap[],
-                bool accumulate = false, uint32_t sid = 0)
-{ add(ret, points, npoints, bitmap, refmap, accumulate, sid); }
-
-template
+    void batch_diff(bucket_h ret[], const affine_h points[], uint32_t npoints,
+                    const uint32_t bitmap[], const uint32_t refmap[],
+                    bool accumulate = false, uint32_t sid = 0)
+{
+    add(ret, points, npoints, bitmap, refmap, accumulate, sid);
+}
+
+template
 __launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__
-void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints,
-                    const uint32_t digits[], const uint32_t& ndigits)
+    void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints,
+                        const uint32_t digits[], const uint32_t &ndigits)
 {
     const uint32_t degree = bucket_t::degree;
     const uint32_t warp_sz = WARP_SZ / degree;
-    const uint32_t tid = (threadIdx.x + blockDim.x*blockIdx.x) / degree;
+    const uint32_t tid = (threadIdx.x + blockDim.x * blockIdx.x) / degree;
     const uint32_t xid = tid % warp_sz;
 
     bucket_t acc;
     acc.inf();
 
-    for (size_t i = tid; i < ndigits; i += gridDim.x*blockDim.x/degree) {
+    for (size_t i = tid; i < ndigits; i += gridDim.x * blockDim.x / degree)
+    {
         uint32_t digit = digits[i];
         affine_t p = points[digit & 0x7fffffff];
         if (degree == 2)
@@ -156,20 +168,21 @@ void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints,
     }
 
 #ifdef __CUDA_ARCH__
-    for (uint32_t off = 1; off < warp_sz;) {
-        bucket_t down = acc.shfl_down(off*degree);
+    for (uint32_t off = 1; off < warp_sz;)
+    {
+        bucket_t down = acc.shfl_down(off * degree);
 
         off <<= 1;
-        if ((xid & (off-1)) == 0)
+        if ((xid & (off - 1)) == 0)
             acc.uadd(down); // .add() triggers spills ... in .shfl_down()
     }
 #endif
 
     if (xid == 0)
-        ret[tid/warp_sz] = acc;
+        ret[tid / warp_sz] = acc;
 }
 
-template
+template
 bucket_t sum_up(const bucket_t inp[], size_t n)
 {
     bucket_t sum = inp[0];
@@ -178,7 +191,49 @@ bucket_t sum_up(const bucket_t inp[], size_t n)
     return sum;
 }
 
-template
-bucket_t sum_up(const std::vector& inp)
-{ return sum_up(&inp[0], inp.size()); }
+template
+bucket_t sum_up(const std::vector &inp)
+{
+    return sum_up(&inp[0], inp.size());
+}
+
+template
+__global__ void compress(bucket_h ret[], const affine_h points[], size_t npoints,
+                         const uint32_t digits[], const uint32_t &ndigits)
+{
+    const uint32_t degree = bucket_t::degree;
+    const uint32_t warp_sz = WARP_SZ / degree;
+    const uint32_t tid = (threadIdx.x + blockDim.x * blockIdx.x) / degree;
+    const uint32_t xid = tid % warp_sz;
+
+    bucket_t acc;
+    acc.inf();
+
+    for (size_t i = tid; i < ndigits; i += gridDim.x * blockDim.x / degree)
+    {
+        uint32_t digit = digits[i];
+        affine_t p = points[digit & 0x7fffffff];
+        if (degree == 2)
+            acc.uadd(p, digit >> 31);
+        else
+            acc.add(p, digit >> 31);
+    }
+
+#ifdef __CUDA_ARCH__
+    for (uint32_t off = 1; off < warp_sz;)
+    {
+        bucket_t down = acc.shfl_down(off * degree);
+
+        off <<= 1;
+        if ((xid & (off - 1)) == 0)
+            acc.uadd(down); // .add() triggers spills ... in .shfl_down()
+    }
+#endif
+
+    if (xid == 0)
+        ret[tid / warp_sz] = acc;
+}
+
 #endif
diff --git a/msm/pippenger.cuh b/msm/pippenger.cuh
index 104da21..1e879b0 100644
--- a/msm/pippenger.cuh
+++ b/msm/pippenger.cuh
@@ -362,6 +362,7 @@ class msm_t
     bucket_h *d_buckets;
     affine_h *d_points;
     scalar_t *d_scalars;
+    uint32_t *d_pidx;
     vec2d_t d_hist;
 
     bool owned;
@@ -389,7 +390,7 @@ class msm_t
 public:
     msm_t(const affine_t points[], size_t np, bool owned,
           size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1)
-        : owned(owned), gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr)
+        : owned(owned), gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr), d_pidx(nullptr)
     {
         npoints = (np + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
 
@@ -449,7 +450,8 @@ public:
 private:
     void digits(const scalar_t d_scalars[], size_t len,
-                vec2d_t &d_digits, vec2d_t &d_temps, bool mont)
+                vec2d_t &d_digits, vec2d_t &d_temps,
+                bool mont, uint32_t *d_pidx)
     {
         // Using larger grid size doesn't make 'sort' run faster, actually
         // quite contrary. Arguably because global memory bus gets
@@ -487,20 +489,21 @@ private:
         {
             gpu[2].launch_coop(sort, {{grid_size, 2}, SORT_BLOCKDIM, shared_sz},
                                d_digits, len, win, d_temps, d_hist,
-                               wbits - 1, wbits - 1, win == nwins - 2 ? top - 1 : wbits - 1);
+                               wbits - 1, wbits - 1, win == nwins - 2 ? top - 1 : wbits - 1,
+                               d_pidx);
         }
         if (win < nwins)
         {
             gpu[2].launch_coop(sort, {{grid_size, 1}, SORT_BLOCKDIM, shared_sz},
                                d_digits, len, win, d_temps, d_hist,
-                               wbits - 1, top - 1, 0u);
+                               wbits - 1, top - 1, 0u, d_pidx);
         }
 #endif
     }
 
 public:
-    RustError invoke(point_t &out, const affine_t *points_, size_t npoints,
-                     const scalar_t *scalars, bool mont = true,
+    RustError invoke(point_t &out, const affine_t *points, size_t npoints,
+                     const scalar_t *scalars, uint32_t pidx[], bool mont = true,
                      size_t ffi_affine_sz = sizeof(affine_t))
     {
         assert(this->npoints == 0 || npoints <= this->npoints);
@@ -522,25 +525,29 @@ public:
         {
            // |scalars| being nullptr means the scalars are pre-loaded to
            // |d_scalars|, otherwise allocate stride.
-            size_t temp_sz = scalars ? sizeof(scalar_t) : 0;
-            temp_sz = stride * std::max(2 * sizeof(uint2), temp_sz);
+            size_t scalars_sz = scalars ? sizeof(scalar_t) : 0;
+            scalars_sz = stride * std::max(2 * sizeof(uint2), scalars_sz);
+
+            size_t pidx_sz = pidx ? sizeof(uint32_t) : 0;
+            pidx_sz *= stride;
 
            // |points| being nullptr means the points are pre-loaded to
            // |d_points|, otherwise allocate double-stride.
-            const char *points = reinterpret_cast(points_);
-            size_t d_point_sz = points ? (batch > 1 ? 2 * stride : stride) : 0;
-            d_point_sz *= sizeof(affine_h);
+            size_t points_sz = points ? (batch > 1 ? 2 * stride : stride) : 0;
+            points_sz *= sizeof(affine_h);
 
             size_t digits_sz = nwins * stride * sizeof(uint32_t);
 
-            dev_ptr_t d_temp{temp_sz + digits_sz + d_point_sz, gpu[2]};
+            dev_ptr_t d_temp{scalars_sz + pidx_sz + digits_sz + points_sz, gpu[2]};
 
             vec2d_t d_temps{&d_temp[0], stride};
-            vec2d_t d_digits{&d_temp[temp_sz], stride};
+            vec2d_t d_digits{&d_temp[scalars_sz + pidx_sz], stride};
             scalar_t *d_scalars = scalars ? (scalar_t *)&d_temp[0]
                                           : this->d_scalars;
-            affine_h *d_points = points ? (affine_h *)&d_temp[temp_sz + digits_sz]
+            uint32_t *d_pidx = pidx ? (uint32_t *)&d_temp[scalars_sz]
+                                    : this->d_pidx;
+            affine_h *d_points = points ? (affine_h *)&d_temp[scalars_sz + pidx_sz + digits_sz]
                                         : this->d_points;
 
             size_t d_off = 0; // device offset
@@ -550,12 +557,16 @@ public:
                 if (scalars)
                     gpu[2].HtoD(&d_scalars[d_off], &scalars[h_off], num);
 
-                digits(&d_scalars[0], num, d_digits, d_temps, mont);
+                if (pidx)
+                    gpu[2].HtoD(&d_pidx[d_off], &pidx[d_off], num);
+                digits(&d_scalars[0], num, d_digits, d_temps, mont, d_pidx);
                 gpu[2].record(ev);
 
-                if (points)
+                if (points) {
                     gpu[0].HtoD(&d_points[d_off], &points[h_off], num, ffi_affine_sz);
+                    CUDA_OK(cudaGetLastError());
+                }
 
                 for (uint32_t i = 0; i < batch; i++)
                 {
@@ -569,7 +580,8 @@ public:
                     gpu[i & 1].launch_coop(accumulate, {gpu.sm_count(), 0},
-                                           d_buckets, nwins, wbits, &d_points[d_off], d_digits, d_hist, i & 1);
+                                           d_buckets, nwins, wbits, &d_points[pidx ? 0 : d_off], d_digits, d_hist, i & 1);
+                    CUDA_OK(cudaGetLastError());
                     gpu[i & 1].record(ev);
 
                     integrate<< scalars, bool mont = true)
+    RustError invoke(point_t &out, vec_t scalars, uint32_t pidx[], bool mont = true)
     {
-        return invoke(out, nullptr, scalars.size(), scalars, mont);
+        return invoke(out, nullptr, scalars.size(), scalars, pidx, mont);
     }
 
     RustError invoke(point_t &out, vec_t points,
-                     const scalar_t *scalars, bool mont = true,
+                     const scalar_t *scalars, uint32_t pidx[], bool mont = true,
                      size_t ffi_affine_sz = sizeof(affine_t))
     {
-        return invoke(out, points, points.size(), scalars, mont, ffi_affine_sz);
+        return invoke(out, points, points.size(), scalars, pidx, mont, ffi_affine_sz);
     }
 
     RustError invoke(point_t &out, vec_t points,
-                     vec_t scalars, bool mont = true,
+                     vec_t scalars, uint32_t pidx[], bool mont = true,
                      size_t ffi_affine_sz = sizeof(affine_t))
     {
-        return invoke(out, points, points.size(), scalars, mont, ffi_affine_sz);
+        return invoke(out, points, points.size(), scalars, pidx, mont, ffi_affine_sz);
     }
 
     RustError invoke(point_t &out, const std::vector &points,
-                     const std::vector &scalars, bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
+                     const std::vector &scalars, uint32_t pidx[],
+                     bool mont = true, size_t ffi_affine_sz = sizeof(affine_t))
     {
         return invoke(out, points.data(), std::min(points.size(), scalars.size()),
-                      scalars.data(), mont, ffi_affine_sz);
+                      scalars.data(), nullptr, mont, ffi_affine_sz);
     }
 
 private:
@@ -825,7 +838,7 @@ static RustError mult_pippenger(point_t *out, const affine_t points[], size_t np
     {
         msm_t msm{nullptr, npoints, true};
         return msm.invoke(*out, slice_t{points, npoints},
-                          scalars, mont, ffi_affine_sz);
+                          scalars, nullptr, mont, ffi_affine_sz);
     }
     catch (const cuda_error &e)
     {
@@ -842,15 +855,15 @@ template
 static RustError mult_pippenger_with(point_t *out,
                                      msm_context_t *msm_context, size_t npoints,
-                                     const scalar_t scalars[], bool mont = true,
+                                     const scalar_t scalars[], uint32_t pidx[], bool mont = true,
                                      size_t ffi_affine_sz = sizeof(affine_t))
 {
     try
     {
         msm_t msm{nullptr, npoints, false};
         msm.set_d_points(msm_context->d_points);
-        return msm.invoke(*out, nullptr, npoints,
-                          scalars, mont, ffi_affine_sz);
+        return msm.invoke(*out, nullptr, npoints, scalars, pidx,
+                          mont, ffi_affine_sz);
     }
     catch (const cuda_error &e)
     {
diff --git a/msm/sort.cuh b/msm/sort.cuh
index 0b2928c..8e77f37 100644
--- a/msm/sort.cuh
+++ b/msm/sort.cuh
@@ -11,34 +11,35 @@
 #define SORT_BLOCKDIM 1024
 
 #ifndef DIGIT_BITS
-# define DIGIT_BITS 13
+#define DIGIT_BITS 13
 #endif
 #if DIGIT_BITS < 10 || DIGIT_BITS > 14
-# error "impossible DIGIT_BITS"
+#error "impossible DIGIT_BITS"
 #endif
 
 __launch_bounds__(SORT_BLOCKDIM)
-__global__ void sort(vec2d_t inouts, size_t len, uint32_t win,
-                     vec2d_t temps, vec2d_t histograms,
-                     uint32_t wbits, uint32_t lsbits0, uint32_t lsbits1);
+    __global__ void sort(vec2d_t inouts, size_t len, uint32_t win,
+                         vec2d_t temps, vec2d_t histograms,
+                         uint32_t wbits, uint32_t lsbits0, uint32_t lsbits1, uint32_t *d_pidx);
 
 #ifndef __MSM_SORT_DONT_IMPLEMENT__
 
 #ifndef WARP_SZ
-# define WARP_SZ 32
+#define WARP_SZ 32
 #endif
 
 #ifdef __GNUC__
-# define asm __asm__ __volatile__
+#define asm __asm__ __volatile__
 #else
-# define asm asm volatile
+#define asm asm volatile
 #endif
 
-static const uint32_t N_COUNTERS = 1<= 12
-    #pragma unroll
-    for (uint32_t i = 0; i < N_SUMS/4; i++)
-        ((uint4*)counters)[threadIdx.x + i*SORT_BLOCKDIM] = uint4{0, 0, 0, 0};
+#pragma unroll
+    for (uint32_t i = 0; i < N_SUMS / 4; i++)
+        ((uint4 *)counters)[threadIdx.x + i * SORT_BLOCKDIM] = uint4{0, 0, 0, 0};
 #else
-    #pragma unroll
+#pragma unroll
     for (uint32_t i = 0; i < N_SUMS; i++)
-        counters[threadIdx.x + i*SORT_BLOCKDIM] = 0;
+        counters[threadIdx.x + i * SORT_BLOCKDIM] = 0;
 #endif
     __syncthreads();
 }
 
-__device__ __forceinline__
-void count_digits(const uint32_t src[], uint32_t base, uint32_t len,
-                  uint32_t lshift, uint32_t rshift, uint32_t mask)
+__device__ __forceinline__ void upper_count_digits(const uint32_t src[], uint32_t base, uint32_t len,
+                                                   uint32_t lshift, uint32_t rshift, uint32_t mask)
 {
     zero_counters();
@@ -87,9 +87,10 @@ void count_digits(const uint32_t src[], uint32_t base, uint32_t len,
     src += base;
 
     // count occurrences of each non-zero digit
-    for (uint32_t i = threadIdx.x; i < len; i += SORT_BLOCKDIM) {
+    for (uint32_t i = threadIdx.x; i < len; i += SORT_BLOCKDIM)
+    {
         auto val = src[(size_t)i];
-        auto pck = pack(base+i, pack_mask, (val-1) << lshift);
+        auto pck = pack(base + i, pack_mask, (val - 1) << lshift);
         if (val)
             (void)atomicAdd(&counters[(pck >> rshift) & mask], 1);
     }
@@ -97,36 +98,36 @@ void count_digits(const uint32_t src[], uint32_t base, uint32_t len,
     __syncthreads();
 }
 
-__device__ __forceinline__
-void scatter(uint2 dst[], const uint32_t src[], uint32_t base, uint32_t len,
-             uint32_t lshift, uint32_t rshift, uint32_t mask,
-             uint32_t pidx[] = nullptr)
+__device__ __forceinline__ void upper_scatter(uint2 dst[], const uint32_t src[], uint32_t base, uint32_t len,
+                                              uint32_t lshift, uint32_t rshift, uint32_t mask,
+                                              uint32_t pidx[] = nullptr)
 {
     const uint32_t pack_mask = 0xffffffffU << lshift;
 
     src += base;
 
-    #pragma unroll 1 // the subroutine is memory-io-bound, unrolling makes no difference
-    for (uint32_t i = threadIdx.x; i < len; i += SORT_BLOCKDIM) {
+#pragma unroll 1 // the subroutine is memory-io-bound, unrolling makes no difference
+    for (uint32_t i = threadIdx.x; i < len; i += SORT_BLOCKDIM)
+    {
         auto val = src[(size_t)i];
-        auto pck = pack(base+i, pack_mask, (val-1) << lshift);
-        if (val) {
+        auto pck = pack(base + i, pack_mask, (val - 1) << lshift);
+        if (val)
+        {
             uint32_t idx = atomicSub(&counters[(pck >> rshift) & mask], 1) - 1;
-            uint32_t pid = pidx ? 
pidx[base + i] : base + i; dst[idx] = uint2{pck, pack(pid, 0x80000000, val)}; } } } -__device__ -static void upper_sort(uint2 dst[], const uint32_t src[], uint32_t len, - uint32_t lsbits, uint32_t bits, uint32_t digit, - uint32_t histogram[]) +__device__ static void upper_sort(uint2 dst[], const uint32_t src[], uint32_t len, + uint32_t lsbits, uint32_t bits, uint32_t digit, + uint32_t histogram[], uint32_t pidx[] = nullptr) { uint32_t grid_div = 31 - __clz(gridDim.x); - uint32_t grid_rem = (1<> grid_div; // / gridDim.x; - uint32_t rem = len & grid_rem; // % gridDim.x; + uint32_t slice = len >> grid_div; // / gridDim.x; + uint32_t rem = len & grid_rem; // % gridDim.x; uint32_t base; if (blockIdx.x < rem) @@ -134,18 +135,18 @@ static void upper_sort(uint2 dst[], const uint32_t src[], uint32_t len, else base = slice * blockIdx.x + rem; - const uint32_t mask = (1<> grid_div; // / gridDim.x; uint2 h = uint2{0, 0}; - uint32_t sum, warp_off = warpid*WARP_SZ*N_SUMS + sub_warpid; + uint32_t sum, warp_off = warpid * WARP_SZ * N_SUMS + sub_warpid; - #pragma unroll 1 - for (uint32_t i = 0; i < WARP_SZ*N_SUMS; i += stride, warp_off += stride) { - auto* hptr = &histogram[warp_off << digit]; +#pragma unroll 1 + for (uint32_t i = 0; i < WARP_SZ * N_SUMS; i += stride, warp_off += stride) + { + auto *hptr = &histogram[warp_off << digit]; - sum = (warp_off < 1< 0) - sum += __shfl_sync(0xffffffff, prefix_sums[i-1], WARP_SZ-1); + sum += __shfl_sync(0xffffffff, prefix_sums[i - 1], WARP_SZ - 1); prefix_sums[i] = sum; } // carry over most significant prefix sums from each warp - if (laneid == WARP_SZ-1) - counters[warpid*(WARP_SZ*N_SUMS+1)] = prefix_sums[N_SUMS-1]; + if (laneid == WARP_SZ - 1) + counters[warpid * (WARP_SZ * N_SUMS + 1)] = prefix_sums[N_SUMS - 1]; __syncthreads(); - uint32_t carry_sum = laneid ? counters[(laneid-1)*(WARP_SZ*N_SUMS+1)] : 0; + uint32_t carry_sum = laneid ? counters[(laneid - 1) * (WARP_SZ * N_SUMS + 1)] : 0; __syncthreads(); - carry_sum = sum_up(carry_sum, SORT_BLOCKDIM/WARP_SZ); + carry_sum = sum_up(carry_sum, SORT_BLOCKDIM / WARP_SZ); carry_sum = __shfl_sync(0xffffffff, carry_sum, warpid); carry_sum += base; - #pragma unroll +#pragma unroll for (uint32_t i = 0; i < N_SUMS; i++) - counters[lane_off + i*WARP_SZ] = prefix_sums[i] += carry_sum; + counters[lane_off + i * WARP_SZ] = prefix_sums[i] += carry_sum; - // store the prefix sums to histogram[] - #pragma unroll - for (uint32_t i = 0; i < N_SUMS; i++, lane_off += WARP_SZ) { - if (lane_off < 1< DIGIT_BITS || (lg_gridDim && wbits > lg_gridDim+1)) { + if (wbits > DIGIT_BITS || (lg_gridDim && wbits > lg_gridDim + 1)) + { uint32_t top_bits = wbits / 2; uint32_t low_bits = wbits - top_bits; - if (low_bits < lg_gridDim+1) { - low_bits = lg_gridDim+1; + if (low_bits < lg_gridDim + 1) + { + low_bits = lg_gridDim + 1; top_bits = wbits - low_bits; } - upper_sort(temp, inout, len, lsbits, top_bits, low_bits, histogram); + upper_sort(temp, inout, len, lsbits, top_bits, low_bits, histogram, pidx); - histogram += blockIdx.x< inouts, size_t len, uint32_t win, - vec2d_t temps, vec2d_t histograms, - uint32_t wbits, uint32_t lsbits0, uint32_t lsbits1) + __global__ void sort(vec2d_t inouts, size_t len, uint32_t win, + vec2d_t temps, vec2d_t histograms, + uint32_t wbits, uint32_t lsbits0, uint32_t lsbits1, + uint32_t pidx[] = nullptr) { win += blockIdx.y; sort_row(inouts[win], len, temps[blockIdx.y], histograms[win], - wbits, blockIdx.y==0 ? lsbits0 : lsbits1); + wbits, blockIdx.y == 0 ? 
lsbits0 : lsbits1, pidx);
 }
 
-# undef asm
+#undef asm
 #endif
 #endif
diff --git a/poc/msm-cuda/benches/msm.rs b/poc/msm-cuda/benches/msm.rs
index 35fbfa5..2597065 100644
--- a/poc/msm-cuda/benches/msm.rs
+++ b/poc/msm-cuda/benches/msm.rs
@@ -29,11 +29,14 @@ fn criterion_benchmark(c: &mut Criterion) {
     let name = format!("2**{}", npoints_npow);
     group.bench_function(name, |b| {
         b.iter(|| {
-            let _ = multi_scalar_mult_arkworks(&points.as_slice(), unsafe {
-                std::mem::transmute::<&[_], &[BigInteger256]>(
-                    scalars.as_slice(),
-                )
-            });
+            let _ = multi_scalar_mult_arkworks(
+                &points.as_slice(),
+                unsafe {
+                    std::mem::transmute::<&[_], &[BigInteger256]>(
+                        scalars.as_slice(),
+                    )
+                },
+            );
         })
     });
 
@@ -57,11 +60,12 @@ fn criterion_benchmark_fp2(c: &mut Criterion) {
     let name = format!("2**{}", npoints_npow);
     group.bench_function(name, |b| {
         b.iter(|| {
-            let _ = multi_scalar_mult_fp2_arkworks(&points.as_slice(), unsafe {
-                std::mem::transmute::<&[_], &[BigInteger256]>(
-                    scalars.as_slice(),
-                )
-            });
+            let _ =
+                multi_scalar_mult_fp2_arkworks(&points.as_slice(), unsafe {
+                    std::mem::transmute::<&[_], &[BigInteger256]>(
+                        scalars.as_slice(),
+                    )
+                });
         })
     });
diff --git a/poc/msm-cuda/cuda/pippenger.cu b/poc/msm-cuda/cuda/pippenger.cu
index 94ecb7f..a98680e 100644
--- a/poc/msm-cuda/cuda/pippenger.cu
+++ b/poc/msm-cuda/cuda/pippenger.cu
@@ -19,8 +19,8 @@ typedef fr_t scalar_t;
 #ifndef __CUDA_ARCH__
 
 extern "C" RustError mult_pippenger(point_t* out, const affine_t points[], size_t npoints,
-                                    const scalar_t scalars[])
+                                    const scalar_t scalars[], uint32_t pidx[])
 {
-    return mult_pippenger(out, points, npoints, scalars, false);
+    return mult_pippenger(out, points, npoints, scalars, pidx, false);
 }
 #endif
diff --git a/poc/msm-cuda/examples/msm.rs b/poc/msm-cuda/examples/msm.rs
new file mode 100644
index 0000000..8b41491
--- /dev/null
+++ b/poc/msm-cuda/examples/msm.rs
@@ -0,0 +1,40 @@
+#[cfg(feature = "bls12_377")]
+use ark_bls12_377::{G1Affine, G2Affine};
+#[cfg(feature = "bls12_381")]
+use ark_bls12_381::{G1Affine, G2Affine};
+#[cfg(feature = "bn254")]
+use ark_bn254::G1Affine;
+use ark_ff::BigInteger256;
+
+use std::str::FromStr;
+
+use msm_cuda::*;
+
+/// cargo run --release --example msm --features bn254
+fn main() {
+    let bench_npow = std::env::var("BENCH_NPOW").unwrap_or("16".to_string());
+    let npoints_npow = i32::from_str(&bench_npow).unwrap();
+
+    let (points, mut scalars) =
+        util::generate_points_scalars::<G1Affine>(1usize << npoints_npow);
+
+    let ret = multi_scalar_mult_arkworks(
+        &points.as_slice(),
+        unsafe {
+            std::mem::transmute::<&[_], &[BigInteger256]>(scalars.as_slice())
+        },
+        &(0..1u32 << npoints_npow).rev().collect::<Vec<_>>(),
+    );
+
+    scalars.reverse();
+    let ret_rev = multi_scalar_mult_arkworks(
+        &points.as_slice(),
+        unsafe {
+            std::mem::transmute::<&[_], &[BigInteger256]>(scalars.as_slice())
+        },
+        &(0..1u32 << npoints_npow).collect::<Vec<_>>(),
+    );
+
+    assert_eq!(ret, ret_rev);
+    println!("success")
+}