Skip to content

Commit

Permalink
use pidx
Browse files Browse the repository at this point in the history
  • Loading branch information
Hanting Zhang committed Feb 6, 2024
1 parent 1a0119a commit afd29f0
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 232 deletions.
185 changes: 120 additions & 65 deletions msm/batch_addition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,29 @@
#include <vector>

#ifndef WARP_SZ
# define WARP_SZ 32
#define WARP_SZ 32
#endif

#define BATCH_ADD_BLOCK_SIZE 256
#ifndef BATCH_ADD_NSTREAMS
# define BATCH_ADD_NSTREAMS 8
#define BATCH_ADD_NSTREAMS 8
#elif BATCH_ADD_NSTREAMS == 0
# error "invalid BATCH_ADD_NSTREAMS"
#error "invalid BATCH_ADD_NSTREAMS"
#endif

template<class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
__device__ __forceinline__
static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
const uint32_t bitmap[], const uint32_t refmap[],
bool accumulate, uint32_t sid)
template <class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
__device__ __forceinline__ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
const uint32_t bitmap[], const uint32_t refmap[],
bool accumulate, uint32_t sid)
{
static __device__ uint32_t streams[BATCH_ADD_NSTREAMS];
uint32_t& current = streams[sid % BATCH_ADD_NSTREAMS];
uint32_t &current = streams[sid % BATCH_ADD_NSTREAMS];

const uint32_t degree = bucket_t::degree;
const uint32_t warp_sz = WARP_SZ / degree;
const uint32_t tid = (threadIdx.x + blockDim.x*blockIdx.x) / degree;
const uint32_t tid = (threadIdx.x + blockDim.x * blockIdx.x) / degree;
const uint32_t xid = tid % warp_sz;

uint32_t laneid;
Expand All @@ -42,47 +41,54 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
bucket_t acc;
acc.inf();

if (accumulate && tid < gridDim.x*blockDim.x/WARP_SZ)
if (accumulate && tid < gridDim.x * blockDim.x / WARP_SZ)
acc = ret[tid];

uint32_t base = laneid == 0 ? atomicAdd(&current, 32*WARP_SZ) : 0;
uint32_t base = laneid == 0 ? atomicAdd(&current, 32 * WARP_SZ) : 0;
base = __shfl_sync(0xffffffff, base, 0);

uint32_t chunk = min(32*WARP_SZ, npoints-base);
uint32_t chunk = min(32 * WARP_SZ, npoints - base);
uint32_t bits, refs, word, sign = 0, off = 0xffffffff;

for (uint32_t i = 0, j = 0; base < npoints;) {
if (i == 0) {
bits = bitmap[base/WARP_SZ + laneid];
refs = refmap ? refmap[base/WARP_SZ + laneid] : 0;
for (uint32_t i = 0, j = 0; base < npoints;)
{
if (i == 0)
{
bits = bitmap[base / WARP_SZ + laneid];
refs = refmap ? refmap[base / WARP_SZ + laneid] : 0;

bits ^= refs;
refs &= bits;
}

for (; i < chunk && j < warp_sz; i++) {
if (i%32 == 0)
word = __shfl_sync(0xffffffff, bits, i/32);
if (refmap && (i%32 == 0))
sign = __shfl_sync(0xffffffff, refs, i/32);
for (; i < chunk && j < warp_sz; i++)
{
if (i % 32 == 0)
word = __shfl_sync(0xffffffff, bits, i / 32);
if (refmap && (i % 32 == 0))
sign = __shfl_sync(0xffffffff, refs, i / 32);

if (word & 1) {
if (word & 1)
{
if (j++ == xid)
off = (base + i) | (sign << 31);
}
word >>= 1;
sign >>= 1;
}

if (i == chunk) {
base = laneid == 0 ? atomicAdd(&current, 32*WARP_SZ) : 0;
if (i == chunk)
{
base = laneid == 0 ? atomicAdd(&current, 32 * WARP_SZ) : 0;
base = __shfl_sync(0xffffffff, base, 0);
chunk = min(32*WARP_SZ, npoints-base);
chunk = min(32 * WARP_SZ, npoints - base);
i = 0;
}

if (base >= npoints || j == warp_sz) {
if (off != 0xffffffff) {
if (base >= npoints || j == warp_sz)
{
if (off != 0xffffffff)
{
affine_t p = points[off & 0x7fffffff];
if (degree == 2)
acc.uadd(p, off >> 31);
Expand All @@ -95,58 +101,64 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints,
}

#ifdef __CUDA_ARCH__
for (uint32_t off = 1; off < warp_sz;) {
bucket_t down = acc.shfl_down(off*degree);
for (uint32_t off = 1; off < warp_sz;)
{
bucket_t down = acc.shfl_down(off * degree);

off <<= 1;
if ((xid & (off-1)) == 0)
if ((xid & (off - 1)) == 0)
acc.uadd(down); // .add() triggers spills ... in .shfl_down()
}
#endif

cooperative_groups::this_grid().sync();

if (xid == 0)
ret[tid/warp_sz] = acc;
ret[tid / warp_sz] = acc;

if (threadIdx.x + blockIdx.x == 0)
current = 0;
}

template<class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
template <class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
__launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__
void batch_addition(bucket_h ret[], const affine_h points[], uint32_t npoints,
const uint32_t bitmap[], bool accumulate = false,
uint32_t sid = 0)
{ add<bucket_t>(ret, points, npoints, bitmap, nullptr, accumulate, sid); }

template<class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
void batch_addition(bucket_h ret[], const affine_h points[], uint32_t npoints,
const uint32_t bitmap[], bool accumulate = false,
uint32_t sid = 0)
{
add<bucket_t>(ret, points, npoints, bitmap, nullptr, accumulate, sid);
}

template <class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
__launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__
void batch_diff(bucket_h ret[], const affine_h points[], uint32_t npoints,
const uint32_t bitmap[], const uint32_t refmap[],
bool accumulate = false, uint32_t sid = 0)
{ add<bucket_t>(ret, points, npoints, bitmap, refmap, accumulate, sid); }

template<class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
void batch_diff(bucket_h ret[], const affine_h points[], uint32_t npoints,
const uint32_t bitmap[], const uint32_t refmap[],
bool accumulate = false, uint32_t sid = 0)
{
add<bucket_t>(ret, points, npoints, bitmap, refmap, accumulate, sid);
}

template <class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
__launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__
void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints,
const uint32_t digits[], const uint32_t& ndigits)
void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints,
const uint32_t digits[], const uint32_t &ndigits)
{
const uint32_t degree = bucket_t::degree;
const uint32_t warp_sz = WARP_SZ / degree;
const uint32_t tid = (threadIdx.x + blockDim.x*blockIdx.x) / degree;
const uint32_t tid = (threadIdx.x + blockDim.x * blockIdx.x) / degree;
const uint32_t xid = tid % warp_sz;

bucket_t acc;
acc.inf();

for (size_t i = tid; i < ndigits; i += gridDim.x*blockDim.x/degree) {
for (size_t i = tid; i < ndigits; i += gridDim.x * blockDim.x / degree)
{
uint32_t digit = digits[i];
affine_t p = points[digit & 0x7fffffff];
if (degree == 2)
Expand All @@ -156,20 +168,21 @@ void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints,
}

#ifdef __CUDA_ARCH__
for (uint32_t off = 1; off < warp_sz;) {
bucket_t down = acc.shfl_down(off*degree);
for (uint32_t off = 1; off < warp_sz;)
{
bucket_t down = acc.shfl_down(off * degree);

off <<= 1;
if ((xid & (off-1)) == 0)
if ((xid & (off - 1)) == 0)
acc.uadd(down); // .add() triggers spills ... in .shfl_down()
}
#endif

if (xid == 0)
ret[tid/warp_sz] = acc;
ret[tid / warp_sz] = acc;
}

template<class bucket_t>
template <class bucket_t>
bucket_t sum_up(const bucket_t inp[], size_t n)
{
bucket_t sum = inp[0];
Expand All @@ -178,7 +191,49 @@ bucket_t sum_up(const bucket_t inp[], size_t n)
return sum;
}

template<class bucket_t>
bucket_t sum_up(const std::vector<bucket_t>& inp)
{ return sum_up(&inp[0], inp.size()); }
template <class bucket_t>
bucket_t sum_up(const std::vector<bucket_t> &inp)
{
return sum_up(&inp[0], inp.size());
}

template <class bucket_t, class affine_h,
class bucket_h = class bucket_t::mem_t,
class affine_t = class bucket_t::affine_t>
__global__ void compress(bucket_h ret[], const affine_h points[], size_t npoints,
const uint32_t digits[], const uint32_t &ndigits)
{
const uint32_t degree = bucket_t::degree;
const uint32_t warp_sz = WARP_SZ / degree;
const uint32_t tid = (threadIdx.x + blockDim.x * blockIdx.x) / degree;
const uint32_t xid = tid % warp_sz;

bucket_t acc;
acc.inf();

for (size_t i = tid; i < ndigits; i += gridDim.x * blockDim.x / degree)
{
uint32_t digit = digits[i];
affine_t p = points[digit & 0x7fffffff];
if (degree == 2)
acc.uadd(p, digit >> 31);
else
acc.add(p, digit >> 31);
}

#ifdef __CUDA_ARCH__
for (uint32_t off = 1; off < warp_sz;)
{
bucket_t down = acc.shfl_down(off * degree);

off <<= 1;
if ((xid & (off - 1)) == 0)
acc.uadd(down); // .add() triggers spills ... in .shfl_down()
}
#endif

if (xid == 0)
ret[tid / warp_sz] = acc;
}

#endif
Loading

0 comments on commit afd29f0

Please sign in to comment.