chore(gpu): refactor div to track noise level & degree
agnesLeroy committed Mar 3, 2025
1 parent 1b7c2b6 commit dff1aba
Showing 21 changed files with 526 additions and 2,110 deletions.
7 changes: 4 additions & 3 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -416,9 +416,10 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(

void cuda_integer_div_rem_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *quotient, void *remainder, void const *numerator, void const *divisor,
bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);
CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
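The refactored entry point takes CudaRadixCiphertextFFI handles in place of raw device pointers, so the radix block count no longer travels as a separate num_blocks_in_radix argument. A minimal caller-side sketch, assuming the FFI structs are created and filled elsewhere (illustrative only, not part of this commit):

    CudaRadixCiphertextFFI quotient, remainder, numerator, divisor;
    // ... wrap device buffers; block count and LWE dimension are carried by the structs ...
    cuda_integer_div_rem_radix_ciphertext_kb_64(
        streams, gpu_indexes, gpu_count, &quotient, &remainder, &numerator,
        &divisor, /*is_signed=*/false, mem_ptr, bsks, ksks);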
238 changes: 142 additions & 96 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Large diffs are not rendered by default.

38 changes: 0 additions & 38 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -28,44 +28,6 @@ __host__ void scratch_cuda_integer_abs_kb(
num_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void legacy_host_integer_abs_kb_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
if (!is_signed)
return;

auto radix_params = mem_ptr->params;
auto mask = (Torus *)(mem_ptr->mask->ptr);

auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
uint32_t num_bits_in_ciphertext =
(31 - __builtin_clz(radix_params.message_modulus)) * num_blocks;

cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

legacy_host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
legacy_host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks, requested_flag, uses_carry);

// legacy bitop
legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, ct, mask, ct, bsks, ksks, num_blocks,
mem_ptr->bitxor_mem->lut, mem_ptr->bitxor_mem->params.message_modulus);
}
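For reference, the removed legacy path follows the standard branchless absolute-value recipe: build a sign mask with an arithmetic shift by (bit width - 1), add it, then XOR with it. A plain-integer analogue (a sketch for illustration only, not CUDA backend code):

    #include <cstdint>

    int32_t branchless_abs(int32_t x) {
      int32_t mask = x >> 31;    // all ones when x is negative, all zeros otherwise
      return (x + mask) ^ mask;  // identity for x >= 0, two's-complement negation otherwise
    }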

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
6 changes: 6 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -19,6 +19,12 @@ __host__ void host_integer_radix_bitop_kb(
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {

if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be equal")
auto lut = mem_ptr->lut;
uint64_t degrees[lwe_array_1->num_radix_blocks];
if (mem_ptr->op == BITOP_TYPE::BITAND) {
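The rendered hunk is truncated here, but the per-block degree bookkeeping this refactor introduces can be sketched for the BITAND case: a bitwise AND can never exceed either operand, so each output block's degree can be bounded by the minimum of the two input degrees (the degrees fields below are an assumption about the FFI struct, not code taken from this commit):

    // Hedged sketch only: bound the output degree of each block for BITAND
    for (uint32_t i = 0; i < lwe_array_1->num_radix_blocks; i++)
      degrees[i] = std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);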
44 changes: 0 additions & 44 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -37,50 +37,6 @@ __host__ void zero_out_if(cudaStream_t const *streams,
ksks, predicate, num_radix_blocks);
}

template <typename Torus>
__host__ void legacy_host_integer_radix_cmux_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
Torus const *lwe_array_true, Torus const *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks) {

auto params = mem_ptr->params;
Torus lwe_size = params.big_lwe_dimension + 1;
Torus radix_lwe_size = lwe_size * num_radix_blocks;
cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in->ptr, lwe_array_true,
radix_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
(Torus *)(mem_ptr->buffer_in->ptr) + radix_lwe_size, lwe_array_false,
radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
for (uint i = 0; i < 2 * num_radix_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(
(Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, lwe_condition,
lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
}
legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr),
(Torus *)(mem_ptr->buffer_in->ptr),
(Torus *)(mem_ptr->condition_array->ptr), bsks, ksks,
2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus);

// If the condition was true, true_ct will have kept its value and false_ct
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto mem_true = (Torus *)(mem_ptr->buffer_out->ptr);
auto ptr = (Torus *)mem_ptr->buffer_out->ptr;
auto mem_false = &ptr[radix_lwe_size];
auto added_cts = mem_true;
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
mem_false, params.big_lwe_dimension,
num_radix_blocks);

legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
num_radix_blocks, mem_ptr->message_extract_lut);
}
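The removed legacy cmux mirrors the usual arithmetic selection trick: the bivariate LUT zeroes whichever branch the condition rejects, the two halves are added, and a final LUT extracts the message. A plaintext analogue of that structure (a sketch, assuming cond is 0 or 1):

    #include <cstdint>

    uint64_t plain_cmux(uint64_t cond, uint64_t ct_true, uint64_t ct_false) {
      // the unselected branch is zeroed out, then both branches are summed
      return ct_true * cond + ct_false * (1 - cond);
    }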

template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
236 changes: 0 additions & 236 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -49,113 +49,6 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__host__ void legacy_are_all_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;

auto are_all_block_true_buffer =
mem_ptr->eq_buffer->are_all_block_true_buffer;
auto tmp_out = (Torus *)are_all_block_true_buffer->tmp_out->ptr;

uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);

uint32_t remaining_blocks = num_radix_blocks;

while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;

// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = tmp_out;
auto accumulator_ptr =
(Torus *)are_all_block_true_buffer->tmp_block_accumulated->ptr;
auto is_max_value_lut = are_all_block_true_buffer->is_max_value;
uint32_t chunk_lengths[num_chunks];
auto begin_remaining_blocks = remaining_blocks;
for (int i = 0; i < num_chunks; i++) {
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator_ptr,
input_blocks, big_lwe_dimension,
chunk_length);

accumulator_ptr += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
auto accumulator =
(Torus *)are_all_block_true_buffer->tmp_block_accumulated->ptr;

// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else {
if (chunk_lengths[num_chunks - 1] != max_value) {
// LUT needs to be computed
uint32_t chunk_length = chunk_lengths[num_chunks - 1];
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f);

Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
for (int index = 0; index < num_chunks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else {
h_lut_indexes[index] = 0;
}
}
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
}
lut = is_max_value_lut;
}

// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
}
}
}
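As a worked example of the chunking bound used above, with a typical 2-bit message / 2-bit carry parameter set (an assumption, not taken from this commit):

    uint32_t message_modulus = 4, carry_modulus = 4;
    uint32_t total_modulus = message_modulus * carry_modulus;           // 16
    uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);   // 15 / 3 = 5

so up to 5 boolean blocks can be summed per chunk without overflowing the block's carry space.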

/* This takes an array of lwe ciphertexts, where each is an encryption of
* either 0 or 1.
*
@@ -275,72 +168,6 @@ __host__ void are_all_comparisons_block_true(
}
}

template <typename Torus>
__host__ void legacy_is_at_least_one_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;

auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

cuda_memcpy_async_gpu_to_gpu(
(Torus *)mem_ptr->tmp_lwe_array_out->ptr, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), streams[0],
gpu_indexes[0]);

uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;

// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = (Torus *)mem_ptr->tmp_lwe_array_out->ptr;
auto accumulator = (Torus *)buffer->tmp_block_accumulated->ptr;
uint32_t chunk_lengths[num_chunks];
auto begin_remaining_blocks = remaining_blocks;
for (int i = 0; i < num_chunks; i++) {
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
chunk_length);

accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = (Torus *)buffer->tmp_block_accumulated->ptr;

// Selects a LUT
int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;

// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
(Torus *)mem_ptr->tmp_lwe_array_out->ptr, accumulator, bsks, ksks,
num_chunks, lut);
}
}
}

/* This takes an array of lwe ciphertexts, where each is an encryption of
* either 0 or 1.
*
@@ -417,69 +244,6 @@ __host__ void is_at_least_one_comparisons_block_true(
}
}

template <typename Torus>
__host__ void legacy_host_compare_with_zero_equality(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, int32_t num_radix_blocks,
int_radix_lut<Torus> *zero_comparison) {

auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;

// The idea is that we will sum chunks of blocks until carries are full
// then we compare the sum with 0.
//
// If all blocks were 0, the sum will be zero
// If at least one bock was not zero, the sum won't be zero
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t message_max = message_modulus - 1;

uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;

size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

int num_sum_blocks = 0;
// Accumulator
auto sum = lwe_array_out;

if (num_radix_blocks == 1) {
// Just copy
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
auto sum_i = sum;
auto chunk = lwe_array_in;
while (remainder_blocks > 1) {
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);

accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);

num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);

// Update operands
chunk += (chunk_size - 1) * big_lwe_size;
sum_i += big_lwe_size;
}
}

legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
legacy_are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, sum, mem_ptr,
bsks, ksks, num_sum_blocks);
}
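A plaintext analogue of the removed zero-equality structure, given only as a sketch: blocks are summed in chunks small enough not to overflow the carry space, and the value is zero exactly when every partial sum is zero.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    bool is_zero_chunked(const std::vector<uint64_t> &blocks, uint32_t chunk_size) {
      for (size_t i = 0; i < blocks.size(); i += chunk_size) {
        uint64_t sum = 0;
        for (size_t j = i; j < std::min(blocks.size(), i + (size_t)chunk_size); ++j)
          sum += blocks[j];
        if (sum != 0)
          return false;  // at least one block in this chunk was non-zero
      }
      return true;       // every chunk sum was zero, so the whole value is zero
    }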

// FIXME This function should be improved as it outputs a single LWE ciphertext
// but requires the output to have enough blocks allocated to compute
// intermediate values