From 8314e7d47ce0b923fe77d8e422ea85526dd04cf3 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Mon, 2 Sep 2024 18:13:18 +0200 Subject: [PATCH] chore(gpu): return if chunk_size is 0 --- .../cuda/include/programmable_bootstrap.h | 8 ----- .../pbs/programmable_bootstrap_amortized.cu | 10 ------ .../programmable_bootstrap_cg_multibit.cuh | 4 ++- .../src/pbs/programmable_bootstrap_classic.cu | 19 ----------- .../pbs/programmable_bootstrap_multibit.cuh | 34 +++++++++---------- .../programmable_bootstrap_tbc_multibit.cuh | 4 ++- 6 files changed, 22 insertions(+), 57 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index 773503bb22..be006cfb77 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -81,14 +81,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); - -uint64_t get_buffer_size_programmable_bootstrap_amortized_64( - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count); - -uint64_t get_buffer_size_programmable_bootstrap_64( - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index 5891732459..86ac8dee5d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -1,15 +1,5 @@ #include "programmable_bootstrap_amortized.cuh" -/* - * Returns the buffer size for 64 bits executions - */ -uint64_t get_buffer_size_programmable_bootstrap_amortized_64( - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count) { - return get_buffer_size_programmable_bootstrap_amortized( - glwe_dimension, polynomial_size, input_lwe_ciphertext_count); -} - /* * This scratch function allocates the necessary amount of data on the GPU for * the amortized PBS on 32 bits inputs, into `buffer`. It also diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index a40fdf8b9d..847f9e03d9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -256,7 +256,7 @@ __host__ void execute_cg_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, int lwe_offset) { + uint32_t lwe_chunk_size, uint32_t lwe_offset) { uint64_t full_dm = get_buffer_size_full_sm_cg_multibit_programmable_bootstrap( @@ -275,6 +275,8 @@ __host__ void execute_cg_external_product_loop( uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + if (chunk_size == 0) + return; auto d_mem = buffer->d_mem_acc_cg; auto keybundle_fft = buffer->keybundle_fft; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 90403cbb81..1ee92f9c90 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -182,25 +182,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( } #endif -/* - * Returns the buffer size for 64 bits executions - */ -uint64_t get_buffer_size_programmable_bootstrap_64( - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count) { - - if (has_support_to_cuda_programmable_bootstrap_cg( - glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) - return get_buffer_size_programmable_bootstrap_cg( - glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count); - else - return get_buffer_size_programmable_bootstrap_cg( - glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count); -} - template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index 501c023b0f..e55c559f4e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -465,10 +465,12 @@ __host__ void execute_compute_keybundle( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, int lwe_offset) { + uint32_t lwe_chunk_size, uint32_t lwe_offset) { uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + if (chunk_size == 0) + return; uint32_t keybundle_size_per_input = lwe_chunk_size * level_count * (glwe_dimension + 1) * @@ -506,14 +508,12 @@ __host__ void execute_compute_keybundle( } template -__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, - Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, - pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, int j, int lwe_offset) { +__host__ void execute_step_one( + cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, + pbs_buffer *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) { uint64_t full_sm_accumulate_step_one = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one( @@ -562,14 +562,12 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, } template -__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index, - Torus *lwe_array_out, Torus *lwe_output_indexes, - pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - int32_t grouping_factor, uint32_t level_count, - int j, int lwe_offset, uint32_t lwe_chunk_size) { +__host__ void execute_step_two( + cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_output_indexes, pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, + uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size) { uint64_t full_sm_accumulate_step_two = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( @@ -627,7 +625,7 @@ __host__ void host_multi_bit_programmable_bootstrap( // Accumulate uint32_t chunk_size = std::min( lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); - for (int j = 0; j < chunk_size; j++) { + for (uint32_t j = 0; j < chunk_size; j++) { execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 30de1d10c8..9a28690c25 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -267,7 +267,7 @@ __host__ void execute_tbc_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, int lwe_offset) { + uint32_t lwe_chunk_size, uint32_t lwe_offset) { auto supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< @@ -294,6 +294,8 @@ __host__ void execute_tbc_external_product_loop( uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + if (chunk_size == 0) + return; auto d_mem = buffer->d_mem_acc_tbc; auto keybundle_fft = buffer->keybundle_fft;