diff --git a/icicle_v3/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle_v3/backend/cpu/src/field/cpu_vec_ops.cpp
index 17fd2b06c1..68116c6203 100644
--- a/icicle_v3/backend/cpu/src/field/cpu_vec_ops.cpp
+++ b/icicle_v3/backend/cpu/src/field/cpu_vec_ops.cpp
@@ -89,4 +89,43 @@ eIcicleError cpu_matrix_transpose(
 REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose);
 #ifdef EXT_FIELD
 REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose);
+#endif // EXT_FIELD
+
+/*********************************** BIT REVERSE ***********************************/
+
+template <typename T>
+eIcicleError
+cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out)
+{
+  // Check for invalid arguments
+  if (!vec_in || !vec_out || size == 0) { return eIcicleError::INVALID_ARGUMENT; }
+
+  // Calculate log2(size)
+  int logn = static_cast<int>(std::floor(std::log2(size)));
+  if ((1ULL << logn) != size) {
+    return eIcicleError::INVALID_ARGUMENT; // Ensure size is a power of 2
+  }
+
+  // If vec_in and vec_out are not the same, copy input to output
+  if (vec_in != vec_out) {
+    for (uint64_t i = 0; i < size; ++i) {
+      vec_out[i] = vec_in[i];
+    }
+  }
+
+  // Perform the bit reverse
+  for (uint64_t i = 0; i < size; ++i) {
+    uint64_t rev = 0;
+    for (int j = 0; j < logn; ++j) {
+      if (i & (1ULL << j)) { rev |= 1ULL << (logn - 1 - j); }
+    }
+    if (i < rev) { std::swap(vec_out[i], vec_out[rev]); }
+  }
+
+  return eIcicleError::SUCCESS;
+}
+
+REGISTER_BIT_REVERSE_BACKEND("CPU", cpu_bit_reverse);
+#ifdef EXT_FIELD
+REGISTER_BIT_REVERSE_EXT_FIELD_BACKEND("CPU", cpu_bit_reverse);
 #endif // EXT_FIELD
\ No newline at end of file
diff --git a/icicle_v3/backend/cuda/src/field/cuda_vec_ops.cu b/icicle_v3/backend/cuda/src/field/cuda_vec_ops.cu
index a84a4d27ac..a88b6926bd 100644
--- a/icicle_v3/backend/cuda/src/field/cuda_vec_ops.cu
+++ b/icicle_v3/backend/cuda/src/field/cuda_vec_ops.cu
@@ -165,6 +165,68 @@ cudaError_t transpose_matrix(
   return CHK_LAST();
 }
 
+template <typename E>
+__global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output)
+{
+  uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  // Handling arbitrary vector size
+  if (tid < n) {
+    uint64_t reversed_index = __brevll(tid) >> shift;
+    output[reversed_index] = input[tid];
+  }
+}
+template <typename E>
+__global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift)
+{
+  uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  // Handling arbitrary vector size
+  if (tid < n) {
+    uint64_t reversed_index = __brevll(tid) >> shift;
+    if (reversed_index > tid) {
+      E temp = input[tid];
+      input[tid] = input[reversed_index];
+      input[reversed_index] = temp;
+    }
+  }
+}
+
+template <typename E>
+cudaError_t bit_reverse_cuda_impl(const E* input, uint64_t size, const VecOpsConfig& cfg, E* output)
+{
+  cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(cfg.stream);
+
+  if (size & (size - 1)) THROW_ICICLE_ERR(eIcicleError::INVALID_ARGUMENT, "bit_reverse: size must be a power of 2");
+  if ((input == output) && (cfg.is_a_on_device != cfg.is_result_on_device))
+    THROW_ICICLE_ERR(
+      eIcicleError::INVALID_ARGUMENT, "bit_reverse: equal devices should have same is_on_device parameters");
+
+  E* d_output;
+  if (cfg.is_result_on_device) {
+    d_output = output;
+  } else {
+    // allocate output on gpu
+    CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cuda_stream));
+  }
+
+  uint64_t shift = __builtin_clzll(size) + 1;
+  uint64_t num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
+
+  if ((input != output) && cfg.is_a_on_device) {
+    bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cuda_stream>>>(input, size, shift, d_output);
+  } else {
+    if (!cfg.is_a_on_device) {
+      CHK_IF_RETURN(cudaMemcpyAsync(d_output, input, sizeof(E) * size, cudaMemcpyHostToDevice, cuda_stream));
+    }
+    bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cuda_stream>>>(d_output, size, shift);
+  }
+  if (!cfg.is_result_on_device) {
+    CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cuda_stream));
+    CHK_IF_RETURN(cudaFreeAsync(d_output, cuda_stream));
+  }
+  if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cuda_stream));
+  return CHK_LAST();
+}
+
 /************************************ REGISTRATION ************************************/
 
 #include "icicle/fields/field_config.h"
@@ -203,14 +265,23 @@ eIcicleError matrix_transpose_cuda(
   return translateCudaError(err);
 }
 
+template <typename T>
+eIcicleError bit_reverse_cuda(const Device& device, const T* in, uint64_t size, const VecOpsConfig& config, T* out)
+{
+  auto err = bit_reverse_cuda_impl(in, size, config, out);
+  return translateCudaError(err);
+}
+
 REGISTER_VECTOR_ADD_BACKEND("CUDA", add_cuda);
 REGISTER_VECTOR_SUB_BACKEND("CUDA", sub_cuda);
 REGISTER_VECTOR_MUL_BACKEND("CUDA", mul_cuda);
 REGISTER_MATRIX_TRANSPOSE_BACKEND("CUDA", matrix_transpose_cuda);
+REGISTER_BIT_REVERSE_BACKEND("CUDA", bit_reverse_cuda);
 #ifdef EXT_FIELD
 REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CUDA", add_cuda);
 REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CUDA", sub_cuda);
 REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CUDA", mul_cuda);
 REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CUDA", matrix_transpose_cuda);
+REGISTER_BIT_REVERSE_EXT_FIELD_BACKEND("CUDA", bit_reverse_cuda);
 #endif // EXT_FIELD
diff --git a/icicle_v3/include/icicle/vec_ops.h b/icicle_v3/include/icicle/vec_ops.h
index 880c3602a5..c30d420fd1 100644
--- a/icicle_v3/include/icicle/vec_ops.h
+++ b/icicle_v3/include/icicle/vec_ops.h
@@ -25,7 +25,7 @@ namespace icicle {
    * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
    * function will block the current CPU thread.
    */
-  ConfigExtension* ext = nullptr; /** backend specific extensions*/
+  ConfigExtension* ext = nullptr; /** backend specific extensions*/
 };
 
 /**
@@ -46,21 +46,24 @@ namespace icicle {
 
   // template APIs
 
-  template <typename S>
-  eIcicleError vector_add(const S* vec_a, const S* vec_b, uint64_t n, const VecOpsConfig& config, S* output);
+  template <typename T>
+  eIcicleError vector_add(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
 
-  template <typename S>
-  eIcicleError vector_sub(const S* vec_a, const S* vec_b, uint64_t n, const VecOpsConfig& config, S* output);
+  template <typename T>
+  eIcicleError vector_sub(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
 
-  template <typename S>
-  eIcicleError vector_mul(const S* vec_a, const S* vec_b, uint64_t n, const VecOpsConfig& config, S* output);
+  template <typename T>
+  eIcicleError vector_mul(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
 
-  template <typename S>
-  eIcicleError convert_montgomery(const S* input, uint64_t size, bool is_into, const VecOpsConfig& config, S* output);
+  template <typename T>
+  eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output);
 
-  template <typename E>
+  template <typename T>
   eIcicleError
-  matrix_transpose(const E* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, E* mat_out);
+  matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out);
+
+  template <typename T>
+  eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out);
 
   /*************************** Backend registration ***************************/
 
@@ -137,6 +140,19 @@ namespace icicle {
     }();                                                                                                               \
   }
 
+  using scalarBitReverseOpImpl = std::function<eIcicleError(const scalar_t* vec_in, uint64_t size, const VecOpsConfig& config, scalar_t* vec_out)>;
+
+  void register_scalar_bit_reverse(const std::string& deviceType, scalarBitReverseOpImpl);
+
+#define REGISTER_BIT_REVERSE_BACKEND(DEVICE_TYPE, FUNC)                                                                \
+  namespace {                                                                                                          \
+    static bool UNIQUE(_reg_scalar_bit_reverse) = []() -> bool {                                                       \
+      register_scalar_bit_reverse(DEVICE_TYPE, FUNC);                                                                  \
+      return true;                                                                                                     \
+    }();                                                                                                               \
+  }
+
 #ifdef EXT_FIELD
   using extFieldVectorOpImpl = std::function<eIcicleError(const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output)>;
+
+  using extFieldBitReverseOpImpl = std::function<eIcicleError(const extension_t* vec_in, uint64_t size, const VecOpsConfig& config, extension_t* vec_out)>;
+
+  void register_extension_bit_reverse(const std::string& deviceType, extFieldBitReverseOpImpl);
+
+#define REGISTER_BIT_REVERSE_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC)                                                      \
+  namespace {                                                                                                          \
+    static bool UNIQUE(_reg_extension_bit_reverse) = []() -> bool {                                                    \
+      register_extension_bit_reverse(DEVICE_TYPE, FUNC);                                                               \
+      return true;                                                                                                     \
+    }();                                                                                                               \
+  }
 #endif // EXT_FIELD
 } // namespace icicle
\ No newline at end of file
diff --git a/icicle_v3/src/vec_ops.cpp b/icicle_v3/src/vec_ops.cpp
index 8537092f9e..341486542f 100644
--- a/icicle_v3/src/vec_ops.cpp
+++ b/icicle_v3/src/vec_ops.cpp
@@ -137,4 +137,36 @@ namespace icicle {
   }
 #endif // EXT_FIELD
 
+  /*********************************** BIT REVERSE ***********************************/
+
+  ICICLE_DISPATCHER_INST(ScalarBitReverseDispatcher, scalar_bit_reverse, scalarBitReverseOpImpl)
+
+  extern "C" eIcicleError
+  CONCAT_EXPAND(FIELD, bit_reverse)(const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)
+  {
+    return ScalarBitReverseDispatcher::execute(input, size, config, output);
+  }
+
+  template <>
+  eIcicleError bit_reverse(const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)
+  {
+    return CONCAT_EXPAND(FIELD, bit_reverse)(input, size, config, output);
+  }
+
+#ifdef EXT_FIELD
+  ICICLE_DISPATCHER_INST(ExtFieldBitReverseDispatcher, extension_bit_reverse, extFieldBitReverseOpImpl)
+
+  extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_bit_reverse)(
+    const extension_t* input, uint64_t size, const VecOpsConfig& config, extension_t* output)
+  {
+    return ExtFieldBitReverseDispatcher::execute(input, size, config, output);
+  }
+
+  template <>
+  eIcicleError bit_reverse(const extension_t* input, uint64_t size, const VecOpsConfig& config, extension_t* output)
+  {
+    return CONCAT_EXPAND(FIELD, extension_bit_reverse)(input, size, config, output);
+  }
+#endif // EXT_FIELD
+
 } // namespace icicle
\ No newline at end of file
diff --git a/icicle_v3/tests/test_field_api.cpp b/icicle_v3/tests/test_field_api.cpp
index 6b993c32dd..fedf265659 100644
--- a/icicle_v3/tests/test_field_api.cpp
+++ b/icicle_v3/tests/test_field_api.cpp
@@ -200,6 +200,34 @@ TYPED_TEST(FieldApiTest, montgomeryConversion)
   ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam)));
 }
 
+TYPED_TEST(FieldApiTest, bitReverse)
+{
+  const int N = 1 << 18;
+  auto elements_main = std::make_unique<TypeParam[]>(N);
+  auto elements_ref = std::make_unique<TypeParam[]>(N);
+  FieldApiTest<TypeParam>::random_samples(elements_main.get(), N);
+  memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam));
+
+  auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) {
+    Device dev = {dev_type, 0};
+    icicle_set_device(dev);
+    auto config = default_vec_ops_config();
+
+    std::ostringstream oss;
+    oss << dev_type << " " << msg;
+
+    START_TIMER(BIT_REVERSE)
+    for (int i = 0; i < iters; ++i) {
+      ICICLE_CHECK(bit_reverse(inout, N, config, inout));
+    }
+    END_TIMER(BIT_REVERSE, oss.str().c_str(), measure);
+  };
+
+  run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "bit-reverse", 1);
+  run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "bit-reverse", 1);
+  ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam)));
+}
+
 #ifdef NTT_ENABLED
 TYPED_TEST(FieldApiTest, ntt)
 {