chore(gpu): refactor div to track noise level & degree
agnesLeroy committed Mar 3, 2025
1 parent 1b7c2b6 commit dff1aba
Showing 21 changed files with 526 additions and 2,110 deletions.
7 changes: 4 additions & 3 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -416,9 +416,10 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(

void cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *quotient, void *remainder, void const *numerator, void const *divisor,
bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);
CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
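The refactored entry point takes CudaRadixCiphertextFFI handles in place of raw device pointers, so the radix block count no longer travels as a separate num_blocks_in_radix argument. A minimal caller-side sketch, assuming the FFI structs are created and filled elsewhere (illustrative only, not part of this commit):

    CudaRadixCiphertextFFI quotient, remainder, numerator, divisor;
    // ... wrap device buffers; block count and LWE dimension are carried by the structs ...
    cuda_integer_div_rem_radix_ciphertext_kb_64(
        streams, gpu_indexes, gpu_count, &quotient, &remainder, &numerator,
        &divisor, /*is_signed=*/false, mem_ptr, bsks, ksks);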
238 changes: 142 additions & 96 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Large diffs are not rendered by default.

38 changes: 0 additions & 38 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -28,44 +28,6 @@ __host__ void scratch_cuda_integer_abs_kb(
num_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void legacy_host_integer_abs_kb_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
if (!is_signed)
return;

auto radix_params = mem_ptr->params;
auto mask = (Torus *)(mem_ptr->mask->ptr);

auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
uint32_t num_bits_in_ciphertext =
(31 - __builtin_clz(radix_params.message_modulus)) * num_blocks;

cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

legacy_host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
legacy_host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks, requested_flag, uses_carry);

// legacy bitop
legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, ct, mask, ct, bsks, ksks, num_blocks,
mem_ptr->bitxor_mem->lut, mem_ptr->bitxor_mem->params.message_modulus);
}
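For reference, the removed legacy path follows the standard branchless absolute-value recipe: build a sign mask with an arithmetic shift by (bit width - 1), add it, then XOR with it. A plain-integer analogue (a sketch for illustration only, not CUDA backend code):

    #include <cstdint>

    int32_t branchless_abs(int32_t x) {
      int32_t mask = x >> 31;    // all ones when x is negative, all zeros otherwise
      return (x + mask) ^ mask;  // identity for x >= 0, two's-complement negation otherwise
    }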

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
6 changes: 6 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -19,6 +19,12 @@ __host__ void host_integer_radix_bitop_kb(
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {

if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be equal")
auto lut = mem_ptr->lut;
uint64_t degrees[lwe_array_1->num_radix_blocks];
if (mem_ptr->op == BITOP_TYPE::BITAND) {
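The rendered hunk is truncated here, but the per-block degree bookkeeping this refactor introduces can be sketched for the BITAND case: a bitwise AND can never exceed either operand, so each output block's degree can be bounded by the minimum of the two input degrees (the degrees fields below are an assumption about the FFI struct, not code taken from this commit):

    // Hedged sketch only: bound the output degree of each block for BITAND
    for (uint32_t i = 0; i < lwe_array_1->num_radix_blocks; i++)
      degrees[i] = std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);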
44 changes: 0 additions & 44 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -37,50 +37,6 @@ __host__ void zero_out_if(cudaStream_t const *streams,
ksks, predicate, num_radix_blocks);
}

template <typename Torus>
__host__ void legacy_host_integer_radix_cmux_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
Torus const *lwe_array_true, Torus const *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks) {

auto params = mem_ptr->params;
Torus lwe_size = params.big_lwe_dimension + 1;
Torus radix_lwe_size = lwe_size * num_radix_blocks;
cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in->ptr, lwe_array_true,
radix_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
(Torus *)(mem_ptr->buffer_in->ptr) + radix_lwe_size, lwe_array_false,
radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
for (uint i = 0; i < 2 * num_radix_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(
(Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, lwe_condition,
lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
}
legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr),
(Torus *)(mem_ptr->buffer_in->ptr),
(Torus *)(mem_ptr->condition_array->ptr), bsks, ksks,
2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus);

// If the condition was true, true_ct will have kept its value and false_ct
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto mem_true = (Torus *)(mem_ptr->buffer_out->ptr);
auto ptr = (Torus *)mem_ptr->buffer_out->ptr;
auto mem_false = &ptr[radix_lwe_size];
auto added_cts = mem_true;
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
mem_false, params.big_lwe_dimension,
num_radix_blocks);

legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
num_radix_blocks, mem_ptr->message_extract_lut);
}
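The removed legacy cmux mirrors the usual arithmetic selection trick: the bivariate LUT zeroes whichever branch the condition rejects, the two halves are added, and a final LUT extracts the message. A plaintext analogue of that structure (a sketch, assuming cond is 0 or 1):

    #include <cstdint>

    uint64_t plain_cmux(uint64_t cond, uint64_t ct_true, uint64_t ct_false) {
      // the unselected branch is zeroed out, then both branches are summed
      return ct_true * cond + ct_false * (1 - cond);
    }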

template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
236 changes: 0 additions & 236 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -49,113 +49,6 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__host__ void legacy_are_all_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;

auto are_all_block_true_buffer =
mem_ptr->eq_buffer->are_all_block_true_buffer;
auto tmp_out = (Torus *)are_all_block_true_buffer->tmp_out->ptr;

uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);

uint32_t remaining_blocks = num_radix_blocks;

while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;

// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = tmp_out;
auto accumulator_ptr =
(Torus *)are_all_block_true_buffer->tmp_block_accumulated->ptr;
auto is_max_value_lut = are_all_block_true_buffer->is_max_value;
uint32_t chunk_lengths[num_chunks];
auto begin_remaining_blocks = remaining_blocks;
for (int i = 0; i < num_chunks; i++) {
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator_ptr,
input_blocks, big_lwe_dimension,
chunk_length);

accumulator_ptr += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
auto accumulator =
(Torus *)are_all_block_true_buffer->tmp_block_accumulated->ptr;

// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else {
if (chunk_lengths[num_chunks - 1] != max_value) {
// LUT needs to be computed
uint32_t chunk_length = chunk_lengths[num_chunks - 1];
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f);

Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
for (int index = 0; index < num_chunks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else {
h_lut_indexes[index] = 0;
}
}
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
}
lut = is_max_value_lut;
}

// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
}
}
}
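As a worked example of the chunking bound used above, with a typical 2-bit message / 2-bit carry parameter set (an assumption, not taken from this commit):

    uint32_t message_modulus = 4, carry_modulus = 4;
    uint32_t total_modulus = message_modulus * carry_modulus;           // 16
    uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);   // 15 / 3 = 5

so up to 5 boolean blocks can be summed per chunk without overflowing the block's carry space.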

/* This takes an array of lwe ciphertexts, where each is an encryption of
* either 0 or 1.
*
@@ -275,72 +168,6 @@ __host__ void are_all_comparisons_block_true(
}
}

template <typename Torus>
__host__ void legacy_is_at_least_one_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;

auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

cuda_memcpy_async_gpu_to_gpu(
(Torus *)mem_ptr->tmp_lwe_array_out->ptr, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), streams[0],
gpu_indexes[0]);

uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;

// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = (Torus *)mem_ptr->tmp_lwe_array_out->ptr;
auto accumulator = (Torus *)buffer->tmp_block_accumulated->ptr;
uint32_t chunk_lengths[num_chunks];
auto begin_remaining_blocks = remaining_blocks;
for (int i = 0; i < num_chunks; i++) {
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
chunk_length);

accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = (Torus *)buffer->tmp_block_accumulated->ptr;

// Selects a LUT
int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;

// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
(Torus *)mem_ptr->tmp_lwe_array_out->ptr, accumulator, bsks, ksks,
num_chunks, lut);
}
}
}

/* This takes an array of lwe ciphertexts, where each is an encryption of
* either 0 or 1.
*
@@ -417,69 +244,6 @@ __host__ void is_at_least_one_comparisons_block_true(
}
}

template <typename Torus>
__host__ void legacy_host_compare_with_zero_equality(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, int32_t num_radix_blocks,
int_radix_lut<Torus> *zero_comparison) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;

// The idea is that we will sum chunks of blocks until carries are full
// then we compare the sum with 0.
//
// If all blocks were 0, the sum will be zero
// If at least one bock was not zero, the sum won't be zero
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t message_max = message_modulus - 1;

uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;

size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

int num_sum_blocks = 0;
// Accumulator
auto sum = lwe_array_out;

if (num_radix_blocks == 1) {
// Just copy
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
auto sum_i = sum;
auto chunk = lwe_array_in;
while (remainder_blocks > 1) {
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);

accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);

num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);

// Update operands
chunk += (chunk_size - 1) * big_lwe_size;
sum_i += big_lwe_size;
}
}

legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
legacy_are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, sum, mem_ptr,
bsks, ksks, num_sum_blocks);
}
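A plaintext analogue of the removed zero-equality structure, given only as a sketch: blocks are summed in chunks small enough not to overflow the carry space, and the value is zero exactly when every partial sum is zero.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    bool is_zero_chunked(const std::vector<uint64_t> &blocks, uint32_t chunk_size) {
      for (size_t i = 0; i < blocks.size(); i += chunk_size) {
        uint64_t sum = 0;
        for (size_t j = i; j < std::min(blocks.size(), i + (size_t)chunk_size); ++j)
          sum += blocks[j];
        if (sum != 0)
          return false;  // at least one block in this chunk was non-zero
      }
      return true;       // every chunk sum was zero, so the whole value is zero
    }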

// FIXME This function should be improved as it outputs a single LWE ciphertext
// but requires the output to have enough blocks allocated to compute
// intermediate values