From 8314e7d47ce0b923fe77d8e422ea85526dd04cf3 Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Mon, 2 Sep 2024 18:13:18 +0200
Subject: [PATCH] chore(gpu): return if chunk_size is 0

---
 .../cuda/include/programmable_bootstrap.h     |  8 -----
 .../pbs/programmable_bootstrap_amortized.cu   | 10 ------
 .../programmable_bootstrap_cg_multibit.cuh    |  4 ++-
 .../src/pbs/programmable_bootstrap_classic.cu | 19 -----------
 .../pbs/programmable_bootstrap_multibit.cuh   | 34 +++++++++----------
 .../programmable_bootstrap_tbc_multibit.cuh   |  4 ++-
 6 files changed, 22 insertions(+), 57 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
index 773503bb22..be006cfb77 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -81,14 +81,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
 
 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count);
-
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count);
 }
 
 template <typename Torus>
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
index 5891732459..86ac8dee5d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -1,15 +1,5 @@
 #include "programmable_bootstrap_amortized.cuh"
 
-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count) {
-  return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
-      glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
-}
-
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the amortized PBS on 32 bits inputs, into `buffer`. It also
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index a40fdf8b9d..847f9e03d9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -256,7 +256,7 @@ __host__ void execute_cg_external_product_loop(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {
 
   uint64_t full_dm =
       get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
@@ -275,6 +275,8 @@ __host__ void execute_cg_external_product_loop(
 
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;
 
   auto d_mem = buffer->d_mem_acc_cg;
   auto keybundle_fft = buffer->keybundle_fft;
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
index 90403cbb81..1ee92f9c90 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -182,25 +182,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
 }
 #endif
 
-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count) {
-
-  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
-          glwe_dimension, polynomial_size, level_count,
-          input_lwe_ciphertext_count))
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-  else
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-}
-
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 501c023b0f..e55c559f4e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -465,10 +465,12 @@ __host__ void execute_compute_keybundle(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {
 
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;
 
   uint32_t keybundle_size_per_input =
       lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -506,14 +508,12 @@ __host__ void execute_compute_keybundle(
 }
 
 template <typename Torus, class params>
-__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lut_vector, Torus *lut_vector_indexes,
-                               Torus *lwe_array_in, Torus *lwe_input_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size, uint32_t base_log,
-                               uint32_t level_count, int j, int lwe_offset) {
+__host__ void execute_step_one(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
+    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
 
   uint64_t full_sm_accumulate_step_one =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -562,14 +562,12 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
 }
 
 template <typename Torus, class params>
-__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lwe_array_out, Torus *lwe_output_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size,
-                               int32_t grouping_factor, uint32_t level_count,
-                               int j, int lwe_offset, uint32_t lwe_chunk_size) {
+__host__ void execute_step_two(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
+    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
+    uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size) {
 
   uint64_t full_sm_accumulate_step_two =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
@@ -627,7 +625,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
     // Accumulate
     uint32_t chunk_size = std::min(
         lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-    for (int j = 0; j < chunk_size; j++) {
+    for (uint32_t j = 0; j < chunk_size; j++) {
       execute_step_one<Torus, params>(
           stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
           lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 30de1d10c8..9a28690c25 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -267,7 +267,7 @@ __host__ void execute_tbc_external_product_loop(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {
 
   auto supports_dsm =
       supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -294,6 +294,8 @@ __host__ void execute_tbc_external_product_loop(
 
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;
 
   auto d_mem = buffer->d_mem_acc_tbc;
   auto keybundle_fft = buffer->keybundle_fft;