Skip to content

Commit

Permalink
chore(gpu): add C++ functions to pop/push/insert in radix ciphertext
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Mar 6, 2025
1 parent b1e7ac7 commit b55a296
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 35 deletions.
46 changes: 11 additions & 35 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -196,24 +196,11 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
// Pop
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], numerator_block_1, 0, 1,
numerator_block_stack, numerator_block_stack->num_radix_blocks - 1,
numerator_block_stack->num_radix_blocks);
reset_radix_ciphertext_blocks(
numerator_block_stack, numerator_block_stack->num_radix_blocks - 1);
// Insert
reset_radix_ciphertext_blocks(
interesting_remainder1, interesting_remainder1->num_radix_blocks + 1);
for (int j = interesting_remainder1->num_radix_blocks - 2; j >= 0; j--) {
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], interesting_remainder1, j + 1, j + 2,
interesting_remainder1, j, j + 1);
}
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
interesting_remainder1, 0, 1,
numerator_block_1, 0, 1);
pop_radix_ciphertext_block_async<Torus>(
streams[0], gpu_indexes[0], numerator_block_1, numerator_block_stack);
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_block_1,
interesting_remainder1, 0);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1, 1,
Expand All @@ -229,24 +216,17 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams, gpu_indexes, gpu_count, interesting_remainder1, tmp_radix, 1,
interesting_remainder1->num_radix_blocks);

// Pop
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], numerator_block_1, 0, 1,
interesting_remainder1, interesting_remainder1->num_radix_blocks - 1,
interesting_remainder1->num_radix_blocks);
reset_radix_ciphertext_blocks(
interesting_remainder1, interesting_remainder1->num_radix_blocks - 1);
pop_radix_ciphertext_block_async<Torus>(streams[0], gpu_indexes[0],
numerator_block_1,
interesting_remainder1);

if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
reset_radix_ciphertext_blocks(
numerator_block_stack, numerator_block_stack->num_radix_blocks + 1);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], numerator_block_stack,
numerator_block_stack->num_radix_blocks - 1,
numerator_block_stack->num_radix_blocks, numerator_block_1, 0, 1);
push_block_to_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_block_1,
numerator_block_stack);
}
}; // left_shift_interesting_remainder1

Expand Down Expand Up @@ -360,10 +340,6 @@ __host__ void host_unsigned_integer_div_rem_kb(
trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

// reset_radix_ciphertext_blocks(
// tmp_1, ceil_div(trivial_blocks->num_radix_blocks,
// message_modulus * carry_modulus - 1));

is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero, tmp_1,
Expand Down
42 changes: 42 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "device.h"
#include "integer/integer.h"
#include "integer/radix_ciphertext.h"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
Expand Down Expand Up @@ -197,4 +198,45 @@ __host__ void set_trivial_radix_ciphertext_async(
}
}

// Pop the last block off `radix_in`.
//
// The block at index `radix_in->num_radix_blocks - 1` is copied into block 0
// of `block` (the copy is issued on `stream` for GPU `gpu_index`), and
// `radix_in` is then resized to one block fewer through
// reset_radix_ciphertext_blocks.
//
// NOTE(review): nothing here checks that `radix_in` holds at least one block;
// with an empty input the index computation would wrap if the counter is
// unsigned -- confirm that callers (or the copy helper) guarantee this.
template <typename Torus>
void pop_radix_ciphertext_block_async(cudaStream_t stream, uint32_t gpu_index,
                                      CudaRadixCiphertextFFI *block,
                                      CudaRadixCiphertextFFI *radix_in) {
  // Index of the block being removed; it is also the block count that
  // `radix_in` must have once the pop is done.
  const auto last_block_index = radix_in->num_radix_blocks - 1;

  copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index, block, 0, 1,
                                           radix_in, last_block_index,
                                           last_block_index + 1);

  reset_radix_ciphertext_blocks(radix_in, last_block_index);
}
// Insert block 0 of `block` into `radix_out` at position `index`.
//
// `radix_out` is first resized to hold one more block. Every block that sits
// at position `index` or above is then moved one slot towards the higher
// indices, and finally block 0 of `block` is copied into slot `index`.
// All copies are issued on `stream` for GPU `gpu_index`.
//
// NOTE(review): `index` is not validated here; presumably callers pass a value
// in [0, previous block count] -- confirm.
template <typename Torus>
void insert_block_in_radix_ciphertext_async(cudaStream_t stream,
                                            uint32_t gpu_index,
                                            CudaRadixCiphertextFFI *block,
                                            CudaRadixCiphertextFFI *radix_out,
                                            int index) {
  reset_radix_ciphertext_blocks(radix_out, radix_out->num_radix_blocks + 1);

  // Walk from the highest previously occupied slot down to `index`, so that a
  // block is always moved before its slot gets overwritten by its lower
  // neighbour.
  int src = radix_out->num_radix_blocks - 2;
  while (src >= index) {
    const int dst = src + 1;
    copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index, radix_out, dst,
                                             dst + 1, radix_out, src, dst);
    --src;
  }

  // Slot `index` is now free to receive the new block.
  copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index, radix_out, index,
                                           index + 1, block, 0, 1);
}

// Append block 0 of `block` at the end of `radix_out`.
//
// `radix_out` is resized to hold one more block through
// reset_radix_ciphertext_blocks, then block 0 of `block` is copied into the
// last slot of the resized ciphertext. The copy is issued on `stream` for GPU
// `gpu_index`.
template <typename Torus>
void push_block_to_radix_ciphertext_async(cudaStream_t stream,
                                          uint32_t gpu_index,
                                          CudaRadixCiphertextFFI *block,
                                          CudaRadixCiphertextFFI *radix_out) {
  reset_radix_ciphertext_blocks(radix_out, radix_out->num_radix_blocks + 1);

  // Read the count only after the resize: the destination is the last slot of
  // the ciphertext as it stands now.
  const auto block_count = radix_out->num_radix_blocks;
  copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index, radix_out,
                                           block_count - 1, block_count, block,
                                           0, 1);
}
#endif

0 comments on commit b55a296

Please sign in to comment.