Skip to content

Commit

Permalink
bit reverse api and test, including cpu and cuda backends
Browse files Browse the repository at this point in the history
  • Loading branch information
yshekel committed Jul 1, 2024
1 parent 5f80fbf commit 4d059f9
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 11 deletions.
39 changes: 39 additions & 0 deletions icicle_v3/backend/cpu/src/field/cpu_vec_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,43 @@ eIcicleError cpu_matrix_transpose(
REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose<scalar_t>);
#ifdef EXT_FIELD
REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose<extension_t>);
#endif // EXT_FIELD

/*********************************** BIT REVERSE ***********************************/

template <typename T>
eIcicleError
cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out)
{
// Check for invalid arguments
if (!vec_in || !vec_out || size == 0) { return eIcicleError::INVALID_ARGUMENT; }

// Calculate log2(size)
int logn = static_cast<int>(std::floor(std::log2(size)));
if ((1ULL << logn) != size) {
return eIcicleError::INVALID_ARGUMENT; // Ensure size is a power of 2
}

// If vec_in and vec_out are not the same, copy input to output
if (vec_in != vec_out) {
for (uint64_t i = 0; i < size; ++i) {
vec_out[i] = vec_in[i];
}
}

// Perform the bit reverse
for (uint64_t i = 0; i < size; ++i) {
uint64_t rev = 0;
for (int j = 0; j < logn; ++j) {
if (i & (1ULL << j)) { rev |= 1ULL << (logn - 1 - j); }
}
if (i < rev) { std::swap(vec_out[i], vec_out[rev]); }
}

return eIcicleError::SUCCESS;
}

REGISTER_BIT_REVERSE_BACKEND("CPU", cpu_bit_reverse<scalar_t>);
#ifdef EXT_FIELD
REGISTER_BIT_REVERSE_EXT_FIELD_BACKEND("CPU", cpu_bit_reverse<extension_t>);
#endif // EXT_FIELD
71 changes: 71 additions & 0 deletions icicle_v3/backend/cuda/src/field/cuda_vec_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,68 @@ cudaError_t transpose_matrix(
return CHK_LAST();
}

template <typename E>
__global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output)
{
uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
// Handling arbitrary vector size
if (tid < n) {
int reversed_index = __brevll(tid) >> shift;
output[reversed_index] = input[tid];
}
}
template <typename E>
__global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift)
{
uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
// Handling arbitrary vector size
if (tid < n) {
int reversed_index = __brevll(tid) >> shift;
if (reversed_index > tid) {
E temp = input[tid];
input[tid] = input[reversed_index];
input[reversed_index] = temp;
}
}
}

template <typename E>
cudaError_t bit_reverse_cuda_impl(const E* input, uint64_t size, const VecOpsConfig& cfg, E* output)
{
cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(cfg.stream);

if (size & (size - 1)) THROW_ICICLE_ERR(eIcicleError::INVALID_ARGUMENT, "bit_reverse: size must be a power of 2");
if ((input == output) & (cfg.is_a_on_device != cfg.is_result_on_device))
THROW_ICICLE_ERR(
eIcicleError::INVALID_ARGUMENT, "bit_reverse: equal devices should have same is_on_device parameters");

E* d_output;
if (cfg.is_result_on_device) {
d_output = output;
} else {
// allocate output on gpu
CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cuda_stream));
}

uint64_t shift = __builtin_clzll(size) + 1;
uint64_t num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;

if ((input != output) & cfg.is_a_on_device) {
bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cuda_stream>>>(input, size, shift, d_output);
} else {
if (!cfg.is_a_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(d_output, input, sizeof(E) * size, cudaMemcpyHostToDevice, cuda_stream));
}
bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cuda_stream>>>(d_output, size, shift);
}
if (!cfg.is_result_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cuda_stream));
CHK_IF_RETURN(cudaFreeAsync(d_output, cuda_stream));
}
if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cuda_stream));
return CHK_LAST();
}

/************************************ REGISTRATION ************************************/

#include "icicle/fields/field_config.h"
Expand Down Expand Up @@ -203,14 +265,23 @@ eIcicleError matrix_transpose_cuda(
return translateCudaError(err);
}

template <typename T>
eIcicleError bit_reverse_cuda(const Device& device, const T* in, uint64_t size, const VecOpsConfig& config, T* out)
{
auto err = bit_reverse_cuda_impl<T>(in, size, config, out);
return translateCudaError(err);
}

REGISTER_VECTOR_ADD_BACKEND("CUDA", add_cuda<scalar_t>);
REGISTER_VECTOR_SUB_BACKEND("CUDA", sub_cuda<scalar_t>);
REGISTER_VECTOR_MUL_BACKEND("CUDA", mul_cuda<scalar_t>);
REGISTER_MATRIX_TRANSPOSE_BACKEND("CUDA", matrix_transpose_cuda<scalar_t>);
REGISTER_BIT_REVERSE_BACKEND("CUDA", bit_reverse_cuda<scalar_t>);

#ifdef EXT_FIELD
REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CUDA", add_cuda<extension_t>);
REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CUDA", sub_cuda<extension_t>);
REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CUDA", mul_cuda<extension_t>);
REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CUDA", matrix_transpose_cuda<extension_t>);
REGISTER_BIT_REVERSE_EXT_FIELD_BACKEND("CUDA", bit_reverse_cuda<extension_t>);
#endif // EXT_FIELD
51 changes: 40 additions & 11 deletions icicle_v3/include/icicle/vec_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace icicle {
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
* function will block the current CPU thread. */

ConfigExtension* ext = nullptr; /** backend specific extensions*/
ConfigExtension* ext = nullptr; /** backend specific extensionT*/
};

/**
Expand All @@ -46,21 +46,24 @@ namespace icicle {

// template APIs

template <typename S>
eIcicleError vector_add(const S* vec_a, const S* vec_b, uint64_t n, const VecOpsConfig& config, S* output);
template <typename T>
eIcicleError vector_add(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);

template <typename S>
eIcicleError vector_sub(const S* vec_a, const S* vec_b, uint64_t n, const VecOpsConfig& config, S* output);
template <typename T>
eIcicleError vector_sub(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);

template <typename S>
eIcicleError vector_mul(const S* vec_a, const S* vec_b, uint64_t n, const VecOpsConfig& config, S* output);
template <typename T>
eIcicleError vector_mul(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);

template <typename S>
eIcicleError convert_montgomery(const S* input, uint64_t size, bool is_into, const VecOpsConfig& config, S* output);
template <typename T>
eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output);

template <typename E>
template <typename T>
eIcicleError
matrix_transpose(const E* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, E* mat_out);
matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out);

template <typename T>
eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out);

/*************************** Backend registration ***************************/

Expand Down Expand Up @@ -137,6 +140,19 @@ namespace icicle {
}(); \
}

using scalarBitReverseOpImpl = std::function<eIcicleError(
const Device& device, const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)>;

void register_scalar_bit_reverse(const std::string& deviceType, scalarBitReverseOpImpl);

#define REGISTER_BIT_REVERSE_BACKEND(DEVICE_TYPE, FUNC) \
namespace { \
static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \
register_scalar_bit_reverse(DEVICE_TYPE, FUNC); \
return true; \
}(); \
}

#ifdef EXT_FIELD
using extFieldVectorOpImpl = std::function<eIcicleError(
const Device& device,
Expand Down Expand Up @@ -211,6 +227,19 @@ namespace icicle {
return true; \
}(); \
}

using extFieldBitReverseOpImpl = std::function<eIcicleError(
const Device& device, const extension_t* input, uint64_t size, const VecOpsConfig& config, extension_t* output)>;

void register_extension_bit_reverse(const std::string& deviceType, extFieldBitReverseOpImpl);

#define REGISTER_BIT_REVERSE_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \
namespace { \
static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \
register_extension_bit_reverse(DEVICE_TYPE, FUNC); \
return true; \
}(); \
}
#endif // EXT_FIELD

} // namespace icicle
32 changes: 32 additions & 0 deletions icicle_v3/src/vec_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,36 @@ namespace icicle {
}
#endif // EXT_FIELD

/*********************************** BIT REVERSE ***********************************/

ICICLE_DISPATCHER_INST(ScalarBitReverseDispatcher, scalar_bit_reverse, scalarBitReverseOpImpl)

extern "C" eIcicleError
CONCAT_EXPAND(FIELD, bit_reverse)(const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)
{
return ScalarBitReverseDispatcher::execute(input, size, config, output);
}

template <>
eIcicleError bit_reverse(const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)
{
return CONCAT_EXPAND(FIELD, bit_reverse)(input, size, config, output);
}

#ifdef EXT_FIELD
ICICLE_DISPATCHER_INST(ExtFieldBitReverseDispatcher, extension_bit_reverse, extFieldBitReverseOpImpl)

extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_bit_reverse)(
const extension_t* input, uint64_t size, const VecOpsConfig& config, extension_t* output)
{
return ExtFieldBitReverseDispatcher::execute(input, size, config, output);
}

template <>
eIcicleError bit_reverse(const extension_t* input, uint64_t size, const VecOpsConfig& config, extension_t* output)
{
return CONCAT_EXPAND(FIELD, extension_bit_reverse)(input, size, config, output);
}
#endif // EXT_FIELD

} // namespace icicle
28 changes: 28 additions & 0 deletions icicle_v3/tests/test_field_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,34 @@ TYPED_TEST(FieldApiTest, montgomeryConversion)
ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam)));
}

TYPED_TEST(FieldApiTest, bitReverse)
{
const int N = 1 << 18;
auto elements_main = std::make_unique<TypeParam[]>(N);
auto elements_ref = std::make_unique<TypeParam[]>(N);
FieldApiTest<TypeParam>::random_samples(elements_main.get(), N);
memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam));

auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) {
Device dev = {dev_type, 0};
icicle_set_device(dev);
auto config = default_vec_ops_config();

std::ostringstream oss;
oss << dev_type << " " << msg;

START_TIMER(BIT_REVERSE)
for (int i = 0; i < iters; ++i) {
ICICLE_CHECK(bit_reverse(inout, N, config, inout));
}
END_TIMER(BIT_REVERSE, oss.str().c_str(), measure);
};

run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "bit-reverse", 1);
run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "bit-reverse", 1);
ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam)));
}

#ifdef NTT_ENABLED
TYPED_TEST(FieldApiTest, ntt)
{
Expand Down

0 comments on commit 4d059f9

Please sign in to comment.