diff --git a/msm/pippenger.cuh b/msm/pippenger.cuh index 61d198e..91d3bf3 100644 --- a/msm/pippenger.cuh +++ b/msm/pippenger.cuh @@ -16,12 +16,12 @@ #include "batch_addition.cuh" #ifndef WARP_SZ -# define WARP_SZ 32 +#define WARP_SZ 32 #endif #ifdef __GNUC__ -# define asm __asm__ __volatile__ +#define asm __asm__ __volatile__ #else -# define asm asm volatile +#define asm asm volatile #endif /* @@ -30,71 +30,74 @@ #ifdef __CUDA_ARCH__ // Transposed scalar_t -template -class scalar_T { - uint32_t val[sizeof(scalar_t)/sizeof(uint32_t)][WARP_SZ]; +template +class scalar_T +{ + uint32_t val[sizeof(scalar_t) / sizeof(uint32_t)][WARP_SZ]; public: - __device__ const uint32_t& operator[](size_t i) const { return val[i][0]; } - __device__ scalar_T& operator()(uint32_t laneid) - { return *reinterpret_cast(&val[0][laneid]); } - __device__ scalar_T& operator=(const scalar_t& rhs) + __device__ const uint32_t &operator[](size_t i) const { return val[i][0]; } + __device__ scalar_T &operator()(uint32_t laneid) + { + return *reinterpret_cast(&val[0][laneid]); + } + __device__ scalar_T &operator=(const scalar_t &rhs) { - for (size_t i = 0; i < sizeof(scalar_t)/sizeof(uint32_t); i++) + for (size_t i = 0; i < sizeof(scalar_t) / sizeof(uint32_t); i++) val[i][0] = rhs[i]; return *this; } }; -template -__device__ __forceinline__ -static uint32_t get_wval(const scalar_T& scalar, uint32_t off, - uint32_t top_i = (scalar_t::nbits + 31) / 32 - 1) +template +__device__ __forceinline__ static uint32_t get_wval(const scalar_T &scalar, uint32_t off, + uint32_t top_i = (scalar_t::nbits + 31) / 32 - 1) { uint32_t i = off / 32; uint64_t ret = scalar[i]; if (i < top_i) - ret |= (uint64_t)scalar[i+1] << 32; + ret |= (uint64_t)scalar[i + 1] << 32; - return ret >> (off%32); + return ret >> (off % 32); } -__device__ __forceinline__ -static uint32_t booth_encode(uint32_t wval, uint32_t wmask, uint32_t wbits) +__device__ __forceinline__ static uint32_t booth_encode(uint32_t wval, uint32_t wmask, uint32_t wbits) { uint32_t sign = (wval >> wbits) & 1; wval = ((wval + 1) & wmask) >> 1; - return sign ? 0-wval : wval; + return sign ? 
0 - wval : wval; } #endif -template +template __launch_bounds__(1024) __global__ -void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len, - uint32_t nwins, uint32_t wbits, bool mont = true) + void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len, + uint32_t nwins, uint32_t wbits, bool mont = true) { - assert(len <= (1U<<31) && wbits < 32); + assert(len <= (1U << 31) && wbits < 32); #ifdef __CUDA_ARCH__ extern __shared__ scalar_T xchange[]; const uint32_t tid = threadIdx.x; - const uint32_t tix = threadIdx.x + blockIdx.x*blockDim.x; + const uint32_t tix = threadIdx.x + blockIdx.x * blockDim.x; const uint32_t top_i = (scalar_t::nbits + 31) / 32 - 1; - const uint32_t wmask = 0xffffffffU >> (31-wbits); // (1U << (wbits+1)) - 1; + const uint32_t wmask = 0xffffffffU >> (31 - wbits); // (1U << (wbits+1)) - 1; - auto& scalar = xchange[tid/WARP_SZ](tid%WARP_SZ); + auto &scalar = xchange[tid / WARP_SZ](tid % WARP_SZ); - #pragma unroll 1 - for (uint32_t i = tix; i < (uint32_t)len; i += gridDim.x*blockDim.x) { +#pragma unroll 1 + for (uint32_t i = tix; i < (uint32_t)len; i += gridDim.x * blockDim.x) + { auto s = scalars[i]; #if 0 s.from(); if (!mont) s.to(); #else - if (mont) s.from(); + if (mont) + s.from(); #endif // clear the most significant bit @@ -104,59 +107,62 @@ void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len, scalar = s; - #pragma unroll 1 - for (uint32_t bit0 = nwins*wbits - 1, win = nwins; --win;) { +#pragma unroll 1 + for (uint32_t bit0 = nwins * wbits - 1, win = nwins; --win;) + { bit0 -= wbits; uint32_t wval = get_wval(scalar, bit0, top_i); wval = booth_encode(wval, wmask, wbits); - if (wval) wval ^= msb; + if (wval) + wval ^= msb; digits[win][i] = wval; } uint32_t wval = s[0] << 1; wval = booth_encode(wval, wmask, wbits); - if (wval) wval ^= msb; + if (wval) + wval ^= msb; digits[0][i] = wval; } #endif } #ifndef LARGE_L1_CODE_CACHE -# if __CUDA_ARCH__-0 >= 800 -# define LARGE_L1_CODE_CACHE 1 -# define ACCUMULATE_NTHREADS 384 -# else -# define LARGE_L1_CODE_CACHE 0 -# define ACCUMULATE_NTHREADS (bucket_t::degree == 1 ? 384 : 256) -# endif +#if __CUDA_ARCH__ - 0 >= 800 +#define LARGE_L1_CODE_CACHE 1 +#define ACCUMULATE_NTHREADS 384 +#else +#define LARGE_L1_CODE_CACHE 0 +#define ACCUMULATE_NTHREADS (bucket_t::degree == 1 ? 
384 : 256) +#endif #endif #ifndef MSM_NTHREADS -# define MSM_NTHREADS 256 +#define MSM_NTHREADS 256 #endif -#if MSM_NTHREADS < 32 || (MSM_NTHREADS & (MSM_NTHREADS-1)) != 0 -# error "bad MSM_NTHREADS value" +#if MSM_NTHREADS < 32 || (MSM_NTHREADS & (MSM_NTHREADS - 1)) != 0 +#error "bad MSM_NTHREADS value" #endif #ifndef MSM_NSTREAMS -# define MSM_NSTREAMS 8 -#elif MSM_NSTREAMS<2 -# error "invalid MSM_NSTREAMS" +#define MSM_NSTREAMS 8 +#elif MSM_NSTREAMS < 2 +#error "invalid MSM_NSTREAMS" #endif -template +template __launch_bounds__(ACCUMULATE_NTHREADS) __global__ -void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, - /*const*/ affine_h points_[], const vec2d_t digits, - const vec2d_t histogram, uint32_t sid = 0) + void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, + /*const*/ affine_h points_[], const vec2d_t digits, + const vec2d_t histogram, uint32_t sid = 0) { - vec2d_t buckets{buckets_, 1U<<--wbits}; - const affine_h* points = points_; + vec2d_t buckets{buckets_, 1U << --wbits}; + const affine_h *points = points_; static __device__ uint32_t streams[MSM_NSTREAMS]; - uint32_t& current = streams[sid % MSM_NSTREAMS]; + uint32_t ¤t = streams[sid % MSM_NSTREAMS]; uint32_t laneid; asm("mov.u32 %0, %laneid;" : "=r"(laneid)); const uint32_t degree = bucket_t::degree; @@ -168,18 +174,19 @@ void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, __shared__ uint32_t xchg; if (threadIdx.x == 0) - xchg = atomicAdd(¤t, blockDim.x/degree); + xchg = atomicAdd(¤t, blockDim.x / degree); __syncthreads(); - x = xchg + threadIdx.x/degree; + x = xchg + threadIdx.x / degree; #else x = laneid == 0 ? atomicAdd(¤t, warp_sz) : 0; x = __shfl_sync(0xffffffff, x, 0) + lane_id; #endif - while (x < (nwins << wbits)) { + while (x < (nwins << wbits)) + { y = x >> wbits; x &= (1U << wbits) - 1; - const uint32_t* h = &histogram[y][x]; + const uint32_t *h = &histogram[y][x]; uint32_t idx, len = h[0]; @@ -191,15 +198,17 @@ void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, if (lane_id == 0 && x != 0) idx = h[-1]; - if ((len -= idx) && !(x == 0 && y == 0)) { - const uint32_t* digs_ptr = &digits[y][idx]; + if ((len -= idx) && !(x == 0 && y == 0)) + { + const uint32_t *digs_ptr = &digits[y][idx]; uint32_t digit = *digs_ptr++; affine_t p = points[digit & 0x7fffffff]; bucket_t bucket = p; bucket.cneg(digit >> 31); - while (--len) { + while (--len) + { digit = *digs_ptr++; p = points[digit & 0x7fffffff]; if (sizeof(bucket) <= 128 || LARGE_L1_CODE_CACHE) @@ -209,7 +218,9 @@ void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, } buckets[y][x] = bucket; - } else { + } + else + { buckets[y][x].inf(); } @@ -223,68 +234,89 @@ void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, current = 0; } -template +template __launch_bounds__(256) __global__ -void integrate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbits) + void integrate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbits) { const uint32_t degree = bucket_t::degree; uint32_t Nthrbits = 31 - __clz(blockDim.x / degree); - assert((blockDim.x & (blockDim.x-1)) == 0 && wbits-1 > Nthrbits); + assert((blockDim.x & (blockDim.x - 1)) == 0 && wbits - 1 > Nthrbits); - vec2d_t buckets{buckets_, 1U<<(wbits-1)}; + vec2d_t buckets{buckets_, 1U << (wbits - 1)}; extern __shared__ uint4 scratch_[]; - auto* scratch = reinterpret_cast(scratch_); + auto *scratch = reinterpret_cast(scratch_); const uint32_t tid = threadIdx.x / degree; const uint32_t bid = blockIdx.x; - 
auto* row = &buckets[bid][0]; - uint32_t i = 1U << (wbits-1-Nthrbits); + auto *row = &buckets[bid][0]; + uint32_t i = 1U << (wbits - 1 - Nthrbits); row += tid * i; uint32_t mask = 0; - if ((bid+1)*wbits > nbits) { - uint32_t lsbits = nbits - bid*wbits; - mask = (1U << (wbits-lsbits)) - 1; + if ((bid + 1) * wbits > nbits) + { + uint32_t lsbits = nbits - bid * wbits; + mask = (1U << (wbits - lsbits)) - 1; } bucket_t res, acc = row[--i]; - if (i & mask) { - if (sizeof(res) <= 128) res.inf(); - else scratch[tid].inf(); - } else { - if (sizeof(res) <= 128) res = acc; - else scratch[tid] = acc; + if (i & mask) + { + if (sizeof(res) <= 128) + res.inf(); + else + scratch[tid].inf(); + } + else + { + if (sizeof(res) <= 128) + res = acc; + else + scratch[tid] = acc; } bucket_t p; - #pragma unroll 1 - while (i--) { +#pragma unroll 1 + while (i--) + { p = row[i]; uint32_t pc = i & mask ? 2 : 0; - #pragma unroll 1 - do { - if (sizeof(bucket_t) <= 128) { +#pragma unroll 1 + do + { + if (sizeof(bucket_t) <= 128) + { p.add(acc); - if (pc == 1) { + if (pc == 1) + { res = p; - } else { + } + else + { acc = p; - if (pc == 0) p = res; + if (pc == 0) + p = res; } - } else { + } + else + { if (LARGE_L1_CODE_CACHE && degree == 1) p.add(acc); else p.uadd(acc); - if (pc == 1) { + if (pc == 1) + { scratch[tid] = p; - } else { + } + else + { acc = p; - if (pc == 0) p = scratch[tid]; + if (pc == 0) + p = scratch[tid]; } } } while (++pc < 2); @@ -292,29 +324,25 @@ void integrate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbi __syncthreads(); - buckets[bid][2*tid] = p; - buckets[bid][2*tid+1] = acc; + buckets[bid][2 * tid] = p; + buckets[bid][2 * tid + 1] = acc; } #undef asm #ifndef SPPARK_DONT_INSTANTIATE_TEMPLATES -template __global__ -void accumulate(bucket_t::mem_t buckets_[], - uint32_t nwins, uint32_t wbits, - /*const*/ affine_t::mem_t points_[], - const vec2d_t digits, - const vec2d_t histogram, - uint32_t sid); -template __global__ -void batch_addition(bucket_t::mem_t buckets[], - const affine_t::mem_t points[], size_t npoints, - const uint32_t digits[], const uint32_t& ndigits); -template __global__ -void integrate(bucket_t::mem_t buckets_[], uint32_t nwins, - uint32_t wbits, uint32_t nbits); -template __global__ -void breakdown(vec2d_t digits, const scalar_t scalars[], - size_t len, uint32_t nwins, uint32_t wbits, bool mont); +template __global__ void accumulate(bucket_t::mem_t buckets_[], + uint32_t nwins, uint32_t wbits, + /*const*/ affine_t::mem_t points_[], + const vec2d_t digits, + const vec2d_t histogram, + uint32_t sid); +template __global__ void batch_addition(bucket_t::mem_t buckets[], + const affine_t::mem_t points[], size_t npoints, + const uint32_t digits[], const uint32_t &ndigits); +template __global__ void integrate(bucket_t::mem_t buckets_[], uint32_t nwins, + uint32_t wbits, uint32_t nbits); +template __global__ void breakdown(vec2d_t digits, const scalar_t scalars[], + size_t len, uint32_t nwins, uint32_t wbits, bool mont); #endif #include @@ -323,11 +351,12 @@ void breakdown(vec2d_t digits, const scalar_t scalars[], #include #include -template -class msm_t { - const gpu_t& gpu; +template +class msm_t +{ + const gpu_t &gpu; size_t npoints; uint32_t wbits, nwins; bucket_h *d_buckets; @@ -335,69 +364,80 @@ class msm_t { scalar_t *d_scalars; vec2d_t d_hist; - template using vec_t = slice_t; + template + using vec_t = slice_t; + + class result_t + { + bucket_t ret[MSM_NTHREADS / bucket_t::degree][2]; - class result_t { - bucket_t ret[MSM_NTHREADS/bucket_t::degree][2]; public: 
result_t() {} - inline operator decltype(ret)&() { return ret; } - inline const bucket_t* operator[](size_t i) const { return ret[i]; } + inline operator decltype(ret) & () { return ret; } + inline const bucket_t *operator[](size_t i) const { return ret[i]; } }; constexpr static int lg2(size_t n) - { int ret=0; while (n>>=1) ret++; return ret; } + { + int ret = 0; + while (n >>= 1) + ret++; + return ret; + } public: msm_t(const affine_t points[], size_t np, size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1) : gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr) { - npoints = (np+WARP_SZ-1) & ((size_t)0-WARP_SZ); + npoints = (np + WARP_SZ - 1) & ((size_t)0 - WARP_SZ); wbits = 17; - if (npoints > 192) { - wbits = std::min(lg2(npoints + npoints/2) - 8, 18); + if (npoints > 192) + { + wbits = std::min(lg2(npoints + npoints / 2) - 8, 18); if (wbits < 10) wbits = 10; - } else if (npoints > 0) { + } + else if (npoints > 0) + { wbits = 10; } nwins = (scalar_t::bit_length() - 1) / wbits + 1; - uint32_t row_sz = 1U << (wbits-1); + uint32_t row_sz = 1U << (wbits - 1); - size_t d_buckets_sz = (nwins * row_sz) - + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ); - size_t d_blob_sz = (d_buckets_sz * sizeof(d_buckets[0])) - + (nwins * row_sz * sizeof(uint32_t)) - + (points ? npoints * sizeof(d_points[0]) : 0); + size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ); + size_t d_blob_sz = (d_buckets_sz * sizeof(d_buckets[0])) + (nwins * row_sz * sizeof(uint32_t)) + (points ? npoints * sizeof(d_points[0]) : 0); d_buckets = reinterpret_cast(gpu.Dmalloc(d_blob_sz)); d_hist = vec2d_t(&d_buckets[d_buckets_sz], row_sz); - if (points) { + if (points) + { d_points = reinterpret_cast(d_hist[nwins]); gpu.HtoD(d_points, points, np, ffi_affine_sz); npoints = np; - } else { + } + else + { npoints = 0; } - } inline msm_t(vec_t points, size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1) - : msm_t(points, points.size(), ffi_affine_sz, device_id) {}; + : msm_t(points, points.size(), ffi_affine_sz, device_id){}; inline msm_t(int device_id = -1) - : msm_t(nullptr, 0, 0, device_id) {}; + : msm_t(nullptr, 0, 0, device_id){}; ~msm_t() { gpu.sync(); - if (d_buckets) gpu.Dfree(d_buckets); + if (d_buckets) + gpu.Dfree(d_buckets); } private: void digits(const scalar_t d_scalars[], size_t len, - vec2d_t& d_digits, vec2d_t&d_temps, bool mont) + vec2d_t &d_digits, vec2d_t &d_temps, bool mont) { // Using larger grid size doesn't make 'sort' run faster, actually // quite contrary. Arguably because global memory bus gets @@ -410,9 +450,8 @@ private: while (grid_size & (grid_size - 1)) grid_size -= (grid_size & (0 - grid_size)); - breakdown<<<2*grid_size, 1024, sizeof(scalar_t)*1024, gpu[2]>>>( - d_digits, d_scalars, len, nwins, wbits, mont - ); + breakdown<<<2 * grid_size, 1024, sizeof(scalar_t) * 1024, gpu[2]>>>( + d_digits, d_scalars, len, nwins, wbits, mont); CUDA_OK(cudaGetLastError()); const size_t shared_sz = sizeof(uint32_t) << DIGIT_BITS; @@ -430,34 +469,36 @@ private: #else // On the other hand a pair of kernels launched in parallel run // ~50% slower but sort twice as much data... - uint32_t top = scalar_t::bit_length() - wbits * (nwins-1); + uint32_t top = scalar_t::bit_length() - wbits * (nwins - 1); uint32_t win; - for (win = 0; win < nwins-1; win += 2) { + for (win = 0; win < nwins - 1; win += 2) + { gpu[2].launch_coop(sort, {{grid_size, 2}, SORT_BLOCKDIM, shared_sz}, - d_digits, len, win, d_temps, d_hist, - wbits-1, wbits-1, win == nwins-2 ? 
top-1 : wbits-1); + d_digits, len, win, d_temps, d_hist, + wbits - 1, wbits - 1, win == nwins - 2 ? top - 1 : wbits - 1); } - if (win < nwins) { + if (win < nwins) + { gpu[2].launch_coop(sort, {{grid_size, 1}, SORT_BLOCKDIM, shared_sz}, - d_digits, len, win, d_temps, d_hist, - wbits-1, top-1, 0u); + d_digits, len, win, d_temps, d_hist, + wbits - 1, top - 1, 0u); } #endif } public: - RustError invoke(point_t& out, const affine_t* points_, size_t npoints, - const scalar_t* scalars, bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) + RustError invoke(point_t &out, const affine_t *points_, size_t npoints, + const scalar_t *scalars, bool mont = true, + size_t ffi_affine_sz = sizeof(affine_t)) { assert(this->npoints == 0 || npoints <= this->npoints); - uint32_t lg_npoints = lg2(npoints + npoints/2); + uint32_t lg_npoints = lg2(npoints + npoints / 2); size_t batch = 1 << (std::max(lg_npoints, wbits) - wbits); batch >>= 6; batch = batch ? batch : 1; uint32_t stride = (npoints + batch - 1) / batch; - stride = (stride+WARP_SZ-1) & ((size_t)0-WARP_SZ); + stride = (stride + WARP_SZ - 1) & ((size_t)0 - WARP_SZ); std::vector res(nwins); std::vector ones(gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ); @@ -465,16 +506,17 @@ public: out.inf(); point_t p; - try { + try + { // |scalars| being nullptr means the scalars are pre-loaded to // |d_scalars|, otherwise allocate stride. size_t temp_sz = scalars ? sizeof(scalar_t) : 0; - temp_sz = stride * std::max(2*sizeof(uint2), temp_sz); + temp_sz = stride * std::max(2 * sizeof(uint2), temp_sz); // |points| being nullptr means the points are pre-loaded to // |d_points|, otherwise allocate double-stride. - const char* points = reinterpret_cast(points_); - size_t d_point_sz = points ? (batch > 1 ? 2*stride : stride) : 0; + const char *points = reinterpret_cast(points_); + size_t d_point_sz = points ? (batch > 1 ? 2 * stride : stride) : 0; d_point_sz *= sizeof(affine_h); size_t digits_sz = nwins * stride * sizeof(uint32_t); @@ -484,13 +526,13 @@ public: vec2d_t d_temps{&d_temp[0], stride}; vec2d_t d_digits{&d_temp[temp_sz], stride}; - scalar_t* d_scalars = scalars ? (scalar_t*)&d_temp[0] + scalar_t *d_scalars = scalars ? (scalar_t *)&d_temp[0] : this->d_scalars; - affine_h* d_points = points ? (affine_h*)&d_temp[temp_sz + digits_sz] + affine_h *d_points = points ? (affine_h *)&d_temp[temp_sz + digits_sz] : this->d_points; - size_t d_off = 0; // device offset - size_t h_off = 0; // host offset + size_t d_off = 0; // device offset + size_t h_off = 0; // host offset size_t num = stride > npoints ? 
npoints : stride; event_t ev; @@ -501,32 +543,31 @@ public: if (points) gpu[0].HtoD(&d_points[d_off], &points[h_off], - num, ffi_affine_sz); + num, ffi_affine_sz); - for (uint32_t i = 0; i < batch; i++) { - gpu[i&1].wait(ev); + for (uint32_t i = 0; i < batch; i++) + { + gpu[i & 1].wait(ev); batch_addition<<>>( - &d_buckets[nwins << (wbits-1)], &d_points[d_off], num, - &d_digits[0][0], d_hist[0][0] - ); + 0, gpu[i & 1]>>>( + &d_buckets[nwins << (wbits - 1)], &d_points[d_off], num, + &d_digits[0][0], d_hist[0][0]); CUDA_OK(cudaGetLastError()); - gpu[i&1].launch_coop(accumulate, - {gpu.sm_count(), 0}, - d_buckets, nwins, wbits, &d_points[d_off], d_digits, d_hist, i&1 - ); - gpu[i&1].record(ev); + gpu[i & 1].launch_coop(accumulate, + {gpu.sm_count(), 0}, + d_buckets, nwins, wbits, &d_points[d_off], d_digits, d_hist, i & 1); + gpu[i & 1].record(ev); integrate<<>>( - d_buckets, nwins, wbits, scalar_t::bit_length() - ); + sizeof(bucket_t) * MSM_NTHREADS / bucket_t::degree, + gpu[i & 1]>>>( + d_buckets, nwins, wbits, scalar_t::bit_length()); CUDA_OK(cudaGetLastError()); - if (i < batch-1) { + if (i < batch - 1) + { h_off += stride; num = h_off + stride <= npoints ? stride : npoints - h_off; @@ -537,26 +578,32 @@ public: d_digits, d_temps, mont); gpu[2].record(ev); - if (points) { + if (points) + { size_t j = (i + 1) & 1; d_off = j ? stride : 0; - gpu[j].HtoD(&d_points[d_off], &points[h_off*ffi_affine_sz], - num, ffi_affine_sz); - } else { + gpu[j].HtoD(&d_points[d_off], &points[h_off * ffi_affine_sz], + num, ffi_affine_sz); + } + else + { d_off = h_off; } } - if (i > 0) { + if (i > 0) + { collect(p, res, ones); out.add(p); } - gpu[i&1].DtoH(ones, d_buckets + (nwins << (wbits-1))); - gpu[i&1].DtoH(res, d_buckets, sizeof(bucket_h)<<(wbits-1)); - gpu[i&1].sync(); + gpu[i & 1].DtoH(ones, d_buckets + (nwins << (wbits - 1))); + gpu[i & 1].DtoH(res, d_buckets, sizeof(bucket_h) << (wbits - 1)); + gpu[i & 1].sync(); } - } catch (const cuda_error& e) { + } + catch (const cuda_error &e) + { gpu.sync(); #ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE return RustError{e.code(), e.what()}; @@ -571,53 +618,63 @@ public: return RustError{cudaSuccess}; } - RustError invoke(point_t& out, const affine_t* points, size_t npoints, - gpu_ptr_t scalars, bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) + RustError invoke(point_t &out, const affine_t *points, size_t npoints, + gpu_ptr_t scalars, bool mont = true, + size_t ffi_affine_sz = sizeof(affine_t)) { d_scalars = scalars; return invoke(out, points, npoints, nullptr, mont, ffi_affine_sz); } - RustError invoke(point_t& out, vec_t scalars, bool mont = true) - { return invoke(out, nullptr, scalars.size(), scalars, mont); } + RustError invoke(point_t &out, vec_t scalars, bool mont = true) + { + return invoke(out, nullptr, scalars.size(), scalars, mont); + } - RustError invoke(point_t& out, vec_t points, - const scalar_t* scalars, bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) - { return invoke(out, points, points.size(), scalars, mont, ffi_affine_sz); } + RustError invoke(point_t &out, vec_t points, + const scalar_t *scalars, bool mont = true, + size_t ffi_affine_sz = sizeof(affine_t)) + { + return invoke(out, points, points.size(), scalars, mont, ffi_affine_sz); + } - RustError invoke(point_t& out, vec_t points, - vec_t scalars, bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) - { return invoke(out, points, points.size(), scalars, mont, ffi_affine_sz); } + RustError invoke(point_t &out, vec_t points, + vec_t scalars, bool mont = true, + 
size_t ffi_affine_sz = sizeof(affine_t)) + { + return invoke(out, points, points.size(), scalars, mont, ffi_affine_sz); + } - RustError invoke(point_t& out, const std::vector& points, - const std::vector& scalars, bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) + RustError invoke(point_t &out, const std::vector &points, + const std::vector &scalars, bool mont = true, + size_t ffi_affine_sz = sizeof(affine_t)) { return invoke(out, points.data(), - std::min(points.size(), scalars.size()), - scalars.data(), mont, ffi_affine_sz); + std::min(points.size(), scalars.size()), + scalars.data(), mont, ffi_affine_sz); } private: - point_t integrate_row(const result_t& row, uint32_t lsbits) + point_t integrate_row(const result_t &row, uint32_t lsbits) { - const int NTHRBITS = lg2(MSM_NTHREADS/bucket_t::degree); + const int NTHRBITS = lg2(MSM_NTHREADS / bucket_t::degree); - assert(wbits-1 > NTHRBITS); + assert(wbits - 1 > NTHRBITS); - size_t i = MSM_NTHREADS/bucket_t::degree - 1; + size_t i = MSM_NTHREADS / bucket_t::degree - 1; - if (lsbits-1 <= NTHRBITS) { - size_t mask = (1U << (NTHRBITS-(lsbits-1))) - 1; + if (lsbits - 1 <= NTHRBITS) + { + size_t mask = (1U << (NTHRBITS - (lsbits - 1))) - 1; bucket_t res, acc = row[i][1]; - if (mask) res.inf(); - else res = acc; + if (mask) + res.inf(); + else + res = acc; - while (i--) { + while (i--) + { acc.add(row[i][1]); if ((i & mask) == 0) res.add(acc); @@ -626,12 +683,13 @@ private: return res; } - point_t res = row[i][0]; + point_t res = row[i][0]; bucket_t acc = row[i][1]; - while (i--) { + while (i--) + { point_t raise = acc; - for (size_t j = 0; j < lsbits-1-NTHRBITS; j++) + for (size_t j = 0; j < lsbits - 1 - NTHRBITS; j++) raise.dbl(); res.add(raise); res.add(row[i][0]); @@ -642,26 +700,28 @@ private: return res; } - void collect(point_t& out, const std::vector& res, - const std::vector& ones) + void collect(point_t &out, const std::vector &res, + const std::vector &ones) { - struct tile_t { + struct tile_t + { uint32_t x, y, dy; point_t p; tile_t() {} }; std::vector grid(nwins); - uint32_t y = nwins-1, total = 0; + uint32_t y = nwins - 1, total = 0; - grid[0].x = 0; - grid[0].y = y; - grid[0].dy = scalar_t::bit_length() - y*wbits; + grid[0].x = 0; + grid[0].y = y; + grid[0].dy = scalar_t::bit_length() - y * wbits; total++; - while (y--) { - grid[total].x = grid[0].x; - grid[total].y = y; + while (y--) + { + grid[total].x = grid[0].x; + grid[total].y = y; grid[total].dy = wbits; total++; } @@ -671,26 +731,29 @@ private: channel_t ch; auto n_workers = min((uint32_t)gpu.ncpus(), total); - while (n_workers--) { - gpu.spawn([&, this, total, counter]() { + while (n_workers--) + { + gpu.spawn([&, this, total, counter]() + { for (size_t work; (work = counter++) < total;) { auto item = &grid[work]; auto y = item->y; item->p = integrate_row(res[y], item->dy); if (++row_sync[y] == 1) ch.send(y); - } - }); + } }); } point_t one = sum_up(ones); out.inf(); size_t row = 0, ny = nwins; - while (ny--) { + while (ny--) + { auto y = ch.recv(); row_sync[y] = -1U; - while (grid[row].y == y) { + while (grid[row].y == y) + { while (row < total && grid[row].y == y) out.add(grid[row++].p); if (y == 0) @@ -705,16 +768,19 @@ private: } }; -template static -RustError mult_pippenger(point_t *out, const affine_t points[], size_t npoints, - const scalar_t scalars[], bool mont = true, - size_t ffi_affine_sz = sizeof(affine_t)) +template +static RustError mult_pippenger(point_t *out, const affine_t points[], size_t npoints, + const scalar_t scalars[], bool mont = true, 
+                                size_t ffi_affine_sz = sizeof(affine_t))
 {
-    try {
+    try
+    {
         msm_t msm{nullptr, npoints};
         return msm.invoke(*out, slice_t{points, npoints},
-                          scalars, mont, ffi_affine_sz);
-    } catch (const cuda_error& e) {
+                          scalars, mont, ffi_affine_sz);
+    }
+    catch (const cuda_error &e)
+    {
         out->inf();
 #ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
         return RustError{e.code(), e.what()};
diff --git a/msm/sort.cuh b/msm/sort.cuh
index 0b2928c..12e6b25 100644
--- a/msm/sort.cuh
+++ b/msm/sort.cuh
@@ -78,7 +78,7 @@ void zero_counters()
 }

 __device__ __forceinline__
-void count_digits(const uint32_t src[], uint32_t base, uint32_t len,
+void upper_count_digits(const uint32_t src[], uint32_t base, uint32_t len,
                   uint32_t lshift, uint32_t rshift, uint32_t mask)
 {
     zero_counters();
@@ -98,7 +98,7 @@ void count_digits(const uint32_t src[], uint32_t base, uint32_t len,
 }

 __device__ __forceinline__
-void scatter(uint2 dst[], const uint32_t src[], uint32_t base, uint32_t len,
+void upper_scatter(uint2 dst[], const uint32_t src[], uint32_t base, uint32_t len,
              uint32_t lshift, uint32_t rshift, uint32_t mask,
              uint32_t pidx[] = nullptr)
 {
@@ -120,7 +120,7 @@ void scatter(uint2 dst[], const uint32_t src[], uint32_t base, uint32_t len,
 __device__
 static void upper_sort(uint2 dst[], const uint32_t src[], uint32_t len,
                        uint32_t lsbits, uint32_t bits, uint32_t digit,
-                       uint32_t histogram[])
+                       uint32_t histogram[], uint32_t pidx[] = nullptr)
 {
     uint32_t grid_div = 31 - __clz(gridDim.x);
     uint32_t grid_rem = (1< inouts, size_t len, uint32_t win, vec2d_t temps, vec2d_t histograms,
-             uint32_t wbits, uint32_t lsbits0, uint32_t lsbits1)
+             uint32_t wbits, uint32_t lsbits0, uint32_t lsbits1,
+             uint32_t pidx[] = nullptr)
 {
     win += blockIdx.y;
     sort_row(inouts[win], len, temps[blockIdx.y], histograms[win],
-             wbits, blockIdx.y==0 ? lsbits0 : lsbits1);
+             wbits, blockIdx.y==0 ? lsbits0 : lsbits1, pidx);
 }
 # undef asm
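The patch leaves the signed-digit (Booth) recoding in breakdown() functionally untouched; only the formatting changes. As a reading aid only, not part of the patch, here is a minimal host-side C++ sketch of what get_wval() and booth_encode() compute per window. The limb layout, the sample values and the check in main() are illustrative assumptions.

// Illustrative sketch (not part of the patch): the signed-digit window
// recoding that breakdown() performs with get_wval()/booth_encode().
// Assumptions: little-endian 32-bit limbs, wbits < 32, and the bit at
// position nwins*wbits-1 is clear (the kernel handles a set top bit by
// conditionally negating the scalar before recoding).
#include <cassert>
#include <cstdint>
#include <vector>

// Same formula as the device booth_encode(): the low wbits bits of the
// result are the bucket index, bit 31 is the sign of the digit.
static uint32_t booth_encode(uint32_t wval, uint32_t wbits)
{
    uint32_t wmask = (1u << (wbits + 1)) - 1;
    uint32_t sign = (wval >> wbits) & 1;
    wval = ((wval + 1) & wmask) >> 1;
    return sign ? 0u - wval : wval;
}

// Bits of window 'win' plus the borrow bit just below it (an implicit
// zero below bit 0), mirroring get_wval() and the s[0]<<1 special case.
static uint32_t window_bits(const std::vector<uint32_t>& limbs,
                            uint32_t nbits, uint32_t win, uint32_t wbits)
{
    uint32_t wval = 0;
    for (uint32_t j = 0; j <= wbits; j++) {
        int32_t bit = (int32_t)(win * wbits + j) - 1;
        if (bit >= 0 && bit < (int32_t)nbits &&
            ((limbs[bit / 32] >> (bit % 32)) & 1))
            wval |= 1u << j;
    }
    return wval;
}

int main()
{
    const uint32_t wbits = 4, nbits = 16;
    const uint32_t nwins = (nbits - 1) / wbits + 1;
    std::vector<uint32_t> limbs = {25000};          // sample scalar, bit 15 clear

    int64_t sum = 0;
    for (uint32_t win = 0; win < nwins; win++) {
        uint32_t enc = booth_encode(window_bits(limbs, nbits, win, wbits), wbits);
        int32_t  mag = enc & ((1u << wbits) - 1);   // bucket index, <= 2^(wbits-1)
        int64_t  dig = (enc >> 31) ? -mag : mag;    // signed digit
        sum += dig * (int64_t(1) << (win * wbits));
    }
    assert(sum == 25000);   // the signed digits reconstruct the scalar
    return 0;
}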
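Similarly, the accumulate/integrate/collect pipeline only changes shape in this patch, not behaviour. The following toy model is again an illustrative assumption, not the library's code: it uses plain integers in place of curve points and unsigned digits in place of the Booth digits (the kernels' signed digits allow half as many buckets per window), but it shows the bucket method those kernels implement — bin by digit, reduce each window with a running suffix sum, combine windows by repeated doubling.

// Illustrative model (not part of the patch): Pippenger's bucket method
// over plain integers, where "point addition" is ordinary addition.
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const uint32_t wbits = 4, nwins = 4;                 // toy parameters
    std::vector<uint64_t> points  = {3, 5, 7, 11, 13};   // stand-ins for curve points
    std::vector<uint64_t> scalars = {9, 250, 77, 123, 200};

    // Reference answer: the multi-scalar multiplication computed directly.
    uint64_t expect = 0;
    for (size_t i = 0; i < points.size(); i++)
        expect += scalars[i] * points[i];

    uint64_t result = 0;
    for (int win = nwins - 1; win >= 0; win--) {
        // accumulate(): bin each point into the bucket named by its digit.
        std::vector<uint64_t> bucket(1u << wbits, 0);
        for (size_t i = 0; i < points.size(); i++) {
            uint32_t digit = (scalars[i] >> (win * wbits)) & ((1u << wbits) - 1);
            bucket[digit] += points[i];
        }
        // integrate(): sum_k k*bucket[k] via a running suffix sum.
        uint64_t acc = 0, sum = 0;
        for (uint32_t k = (1u << wbits) - 1; k >= 1; k--) {
            acc += bucket[k];                            // suffix sum of buckets
            sum += acc;                                  // adds bucket[k] exactly k times
        }
        // collect(): shift previous windows up by wbits doublings, then add.
        for (uint32_t j = 0; j < wbits; j++)
            result += result;
        result += sum;
    }
    assert(result == expect);
    return 0;
}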