diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index c1d04edf40..3bb815acce 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -132,10 +132,11 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( void cuda_integer_mult_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left, - void const *radix_lwe_right, bool const is_bool_right, void *const *bsks, - void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size, - uint32_t num_blocks); + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left, + CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right, + void *const *bsks, void *const *ksks, int8_t *mem_ptr, + uint32_t polynomial_size, uint32_t num_blocks); void cleanup_cuda_integer_mult(void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -375,9 +376,9 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks_in_radix); + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks, + void *const *ksks); void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -393,10 +394,10 @@ void scratch_cuda_integer_scalar_mul_kb_64( void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, uint64_t const *decomposed_scalar, + CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars); + void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t num_scalars); void cleanup_cuda_integer_radix_scalar_mul(void *const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index e7745d5008..fe4224dc73 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -21,7 +21,7 @@ template __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src, uint32_t value, uint32_t blocks_count, uint32_t lwe_size); -void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, +void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in, size_t *h_lwe_idx_out, int32_t *h_smart_copy_in, int32_t *h_smart_copy_out, size_t ch_amount, @@ -1161,10 +1161,10 @@ template struct int_overflowing_sub_memory { }; template struct int_sum_ciphertexts_vec_memory { - Torus *new_blocks; - Torus *new_blocks_copy; - Torus *old_blocks; - Torus *small_lwe_vector; + CudaRadixCiphertextFFI *new_blocks; + CudaRadixCiphertextFFI *new_blocks_copy; + 
CudaRadixCiphertextFFI *old_blocks; + CudaRadixCiphertextFFI *small_lwe_vector; int_radix_params params; int32_t *d_smart_copy_in; @@ -1183,34 +1183,22 @@ template struct int_sum_ciphertexts_vec_memory { int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; // allocate gpu memory for intermediate buffers - new_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - old_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - small_lwe_vector = (Torus *)cuda_malloc_async( - max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(old_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(small_lwe_vector, 0, - max_pbs_count * (params.small_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + new_blocks = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + new_blocks, max_pbs_count, + params.big_lwe_dimension); + new_blocks_copy = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + new_blocks_copy, max_pbs_count, + params.big_lwe_dimension); + old_blocks = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + old_blocks, max_pbs_count, + params.big_lwe_dimension); + small_lwe_vector = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + small_lwe_vector, max_pbs_count, + params.small_lwe_dimension); d_smart_copy_in = (int32_t *)cuda_malloc_async( max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); @@ -1227,8 +1215,9 @@ template struct int_sum_ciphertexts_vec_memory { uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, - Torus *new_blocks, Torus *old_blocks, - Torus *small_lwe_vector) { + CudaRadixCiphertextFFI *new_blocks, + CudaRadixCiphertextFFI *old_blocks, + CudaRadixCiphertextFFI *small_lwe_vector) { mem_reuse = true; this->params = params; @@ -1238,13 +1227,10 @@ template struct int_sum_ciphertexts_vec_memory { this->new_blocks = new_blocks; this->old_blocks = old_blocks; this->small_lwe_vector = small_lwe_vector; - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + new_blocks_copy = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + new_blocks_copy, max_pbs_count, + params.big_lwe_dimension); d_smart_copy_in = (int32_t *)cuda_malloc_async( max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); @@ -1262,12 +1248,15 @@ template struct int_sum_ciphertexts_vec_memory { cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); if (!mem_reuse) { - 
cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], new_blocks); + delete new_blocks; + release_radix_ciphertext(streams[0], gpu_indexes[0], old_blocks); + delete old_blocks; + release_radix_ciphertext(streams[0], gpu_indexes[0], small_lwe_vector); + delete small_lwe_vector; } - - cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], new_blocks_copy); + delete new_blocks_copy; } }; // For sequential algorithm in group propagation @@ -2482,7 +2471,7 @@ template struct int_zero_out_if_buffer { int_radix_params params; - Torus *tmp; + CudaRadixCiphertextFFI *tmp; cudaStream_t *true_streams; cudaStream_t *false_streams; @@ -2495,10 +2484,11 @@ template struct int_zero_out_if_buffer { this->params = params; active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); if (allocate_gpu_memory) { - tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); + tmp = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp, + num_radix_blocks, + params.big_lwe_dimension); // We may use a different stream to allow concurrent operation true_streams = (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); @@ -2512,7 +2502,8 @@ template struct int_zero_out_if_buffer { } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - cuda_drop_async(tmp, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], tmp); + delete tmp; for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(true_streams[j], gpu_indexes[j]); cuda_destroy_stream(false_streams[j], gpu_indexes[j]); @@ -2523,9 +2514,9 @@ template struct int_zero_out_if_buffer { }; template struct int_mul_memory { - Torus *vector_result_sb; - Torus *block_mul_res; - Torus *small_lwe_vector; + CudaRadixCiphertextFFI *vector_result_sb; + CudaRadixCiphertextFFI *block_mul_res; + CudaRadixCiphertextFFI *small_lwe_vector; int_radix_lut *luts_array; // lsb msb int_radix_lut *zero_out_predicate_lut; @@ -2574,7 +2565,6 @@ template struct int_mul_memory { auto polynomial_size = params.polynomial_size; auto message_modulus = params.message_modulus; auto carry_modulus = params.carry_modulus; - auto lwe_dimension = params.small_lwe_dimension; // 'vector_result_lsb' contains blocks from all possible shifts of // radix_lwe_left excluding zero ciphertext blocks @@ -2587,17 +2577,18 @@ template struct int_mul_memory { int total_block_count = lsb_vector_block_count + msb_vector_block_count; // allocate memory for intermediate buffers - vector_result_sb = (Torus *)cuda_malloc_async( - 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - block_mul_res = (Torus *)cuda_malloc_async( - 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - small_lwe_vector = (Torus *)cuda_malloc_async( - total_block_count * (lwe_dimension + 1) * sizeof(Torus), streams[0], - gpu_indexes[0]); + vector_result_sb = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], vector_result_sb, 2 * total_block_count, + params.big_lwe_dimension); + block_mul_res = new 
CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], block_mul_res, 2 * total_block_count, + params.big_lwe_dimension); + small_lwe_vector = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], small_lwe_vector, total_block_count, + params.small_lwe_dimension); // create int_radix_lut objects for lsb, msb, message, carry // luts_array -> lut = {lsb_acc, msb_acc} @@ -2658,9 +2649,12 @@ template struct int_mul_memory { return; } - cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]); - cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]); - cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], vector_result_sb); + delete vector_result_sb; + release_radix_ciphertext(streams[0], gpu_indexes[0], block_mul_res); + delete block_mul_res; + release_radix_ciphertext(streams[0], gpu_indexes[0], small_lwe_vector); + delete small_lwe_vector; luts_array->release(streams, gpu_indexes, gpu_count); sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); @@ -4435,7 +4429,7 @@ template struct int_scalar_mul_buffer { int_radix_params params; int_logical_scalar_shift_buffer *logical_scalar_shift_buffer; int_sum_ciphertexts_vec_memory *sum_ciphertexts_vec_mem; - Torus *preshifted_buffer; + CudaRadixCiphertextFFI *preshifted_buffer; CudaRadixCiphertextFFI *all_shifted_buffer; int_sc_prop_memory *sc_prop_mem; bool anticipated_buffers_drop; @@ -4450,25 +4444,21 @@ template struct int_scalar_mul_buffer { if (allocate_gpu_memory) { uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus); - uint32_t lwe_size = params.big_lwe_dimension + 1; - uint32_t lwe_size_bytes = lwe_size * sizeof(Torus); size_t num_ciphertext_bits = msg_bits * num_radix_blocks; //// Contains all shifted values of lhs for shift in range (0..msg_bits) //// The idea is that with these we can create all other shift that are /// in / range (0..total_bits) for free (block rotation) - preshifted_buffer = (Torus *)cuda_malloc_async( - num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]); + preshifted_buffer = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], preshifted_buffer, num_ciphertext_bits, + params.big_lwe_dimension); all_shifted_buffer = new CudaRadixCiphertextFFI; create_zero_radix_ciphertext_async( streams[0], gpu_indexes[0], all_shifted_buffer, num_ciphertext_bits * num_radix_blocks, params.big_lwe_dimension); - cuda_memset_async(preshifted_buffer, 0, - num_ciphertext_bits * lwe_size_bytes, streams[0], - gpu_indexes[0]); - if (num_ciphertext_bits * num_radix_blocks >= num_radix_blocks + 2) logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer( @@ -4500,7 +4490,8 @@ template struct int_scalar_mul_buffer { release_radix_ciphertext(streams[0], gpu_indexes[0], all_shifted_buffer); delete all_shifted_buffer; if (!anticipated_buffers_drop) { - cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], preshifted_buffer); + delete preshifted_buffer; logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count); delete (logical_scalar_shift_buffer); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index e5ca6f0ebf..b8ac78e306 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -7,11 +7,20 @@ 
template __host__ void zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus const *lwe_array_input, - Torus const *lwe_condition, + CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_input, + CudaRadixCiphertextFFI const *lwe_condition, int_zero_out_if_buffer *mem_ptr, int_radix_lut *predicate, void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) { + if (lwe_array_out->num_radix_blocks < num_radix_blocks || + lwe_array_input->num_radix_blocks < num_radix_blocks) + PANIC("Cuda error: input or output radix ciphertexts does not have enough " + "blocks") + if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension || + lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension) + PANIC("Cuda error: input and output radix ciphertexts must have the same " + "lwe dimension") cuda_set_device(gpu_indexes[0]); auto params = mem_ptr->params; @@ -21,12 +30,11 @@ __host__ void zero_out_if(cudaStream_t const *streams, host_pack_bivariate_blocks_with_single_block( streams, gpu_indexes, gpu_count, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input, lwe_condition, - predicate->lwe_indexes_in, params.big_lwe_dimension, - params.message_modulus, num_radix_blocks); + predicate->lwe_indexes_in, params.message_modulus, num_radix_blocks); - legacy_integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks, - ksks, num_radix_blocks, predicate); + ksks, predicate, num_radix_blocks); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index e03a691c6b..c2896b611c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -557,10 +557,20 @@ __global__ void device_pack_bivariate_blocks_with_single_block( template __host__ void host_pack_bivariate_blocks_with_single_block( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_indexes_out, - Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in, - uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) { + uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out, + Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1, + CudaRadixCiphertextFFI const *lwe_2, Torus const *lwe_indexes_in, + uint32_t shift, uint32_t num_radix_blocks) { + if (lwe_array_out->num_radix_blocks < num_radix_blocks || + lwe_array_1->num_radix_blocks < num_radix_blocks) + PANIC("Cuda error: input or output radix ciphertexts does not have enough " + "blocks") + if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension || + lwe_array_1->lwe_dimension != lwe_2->lwe_dimension) + PANIC("Cuda error: input and output radix ciphertexts must have the same " + "lwe dimension") + auto lwe_dimension = lwe_array_out->lwe_dimension; cuda_set_device(gpu_indexes[0]); // Left message is shifted int num_blocks = 0, num_threads = 0; @@ -568,7 +578,8 @@ __host__ void host_pack_bivariate_blocks_with_single_block( getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); device_pack_bivariate_blocks_with_single_block <<>>( - lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_2, lwe_indexes_in, + (Torus *)lwe_array_out->ptr, lwe_indexes_out, + (Torus *)lwe_array_1->ptr, (Torus *)lwe_2->ptr, 
lwe_indexes_in, lwe_dimension, shift, num_radix_blocks); check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu index 0733a2ad34..56e28b8211 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu @@ -13,7 +13,7 @@ * as output ids, -1 value as an input id means that zero ciphertext will be * copied on output index. */ -void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, +void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in, size_t *h_lwe_idx_out, int32_t *h_smart_copy_in, int32_t *h_smart_copy_out, size_t ch_amount, @@ -127,66 +127,53 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( */ void cuda_integer_mult_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left, - void const *radix_lwe_right, bool const is_bool_right, void *const *bsks, - void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size, - uint32_t num_blocks) { + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left, + CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right, + void *const *bsks, void *const *ksks, int8_t *mem_ptr, + uint32_t polynomial_size, uint32_t num_blocks) { switch (polynomial_size) { case 256: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 512: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 1024: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 2048: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 4096: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + 
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 8192: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 16384: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; default: @@ -226,79 +213,70 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks_in_radix) { + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks, + void *const *ksks) { auto mem = (int_sum_ciphertexts_vec_memory *)mem_ptr; - int *terms_degree = - (int *)malloc(num_blocks_in_radix * num_radix_in_vec * sizeof(int)); - - for (int i = 0; i < num_radix_in_vec * num_blocks_in_radix; i++) { - terms_degree[i] = mem->params.message_modulus - 1; - } - switch (mem->params.polynomial_size) { case 512: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 1024: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 2048: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 4096: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, 
num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 8192: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 16384: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; default: PANIC("Cuda error (integer multiplication): unsupported polynomial size. " "Supported N's are powers of two in the interval [256..16384].") } - - free(terms_degree); } void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 3f32aa77b2..5d443fb296 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -126,9 +126,8 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks, template __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, Torus *msb_blocks, - uint32_t glwe_dimension, + uint32_t big_lwe_size, uint32_t num_blocks) { - size_t big_lwe_dimension = glwe_dimension * params::degree + 1; size_t big_lwe_id = blockIdx.x; size_t radix_id = big_lwe_id / num_blocks; size_t block_id = big_lwe_id % num_blocks; @@ -138,10 +137,9 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, bool process_lsb = (radix_id <= block_id); bool process_msb = (radix_id + 1 <= block_id); - auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_dimension]; - auto cur_res_msb_ct = - &result_blocks[num_blocks * num_blocks * big_lwe_dimension + - big_lwe_id * big_lwe_dimension]; + auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_size]; + auto cur_res_msb_ct = &result_blocks[num_blocks * num_blocks * big_lwe_size + + big_lwe_id * big_lwe_size]; Torus *cur_lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) * radix_id / 2 * (params::degree + 1)]; Torus *cur_msb_radix = (process_msb) @@ -184,12 +182,20 @@ __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb( template __host__ void host_integer_partial_sum_ciphertexts_vec_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *radix_lwe_out, Torus *terms, int *terms_degree, - void *const *bsks, uint64_t *const *ksks, + uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks, int_sum_ciphertexts_vec_memory *mem_ptr, - uint32_t num_blocks_in_radix, uint32_t 
num_radix_in_vec, + uint32_t num_radix_blocks, uint32_t num_radix_in_vec, int_radix_lut *reused_lut) { + if (terms->lwe_dimension != radix_lwe_out->lwe_dimension) + PANIC("Cuda error: output and input radix ciphertexts should have the same " + "lwe dimension") + if (num_radix_in_vec > + terms->num_radix_blocks / radix_lwe_out->num_radix_blocks) + PANIC("Cuda error: input vector does not have enough blocks") + if (num_radix_blocks > radix_lwe_out->num_radix_blocks) + PANIC("Cuda error: output does not have enough blocks") auto new_blocks = mem_ptr->new_blocks; auto new_blocks_copy = mem_ptr->new_blocks_copy; auto old_blocks = mem_ptr->old_blocks; @@ -200,7 +206,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( auto message_modulus = mem_ptr->params.message_modulus; auto carry_modulus = mem_ptr->params.carry_modulus; - auto num_blocks = num_blocks_in_radix; auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension; auto big_lwe_size = big_lwe_dimension + 1; auto glwe_dimension = mem_ptr->params.glwe_dimension; @@ -212,25 +217,23 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( uint32_t num_many_lut = 1; uint32_t lut_stride = 0; - if (num_radix_in_vec == 0) + if (terms->num_radix_blocks == 0) return; - if (num_radix_in_vec == 1) { - cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms, - num_blocks_in_radix * big_lwe_size * - sizeof(Torus), - streams[0], gpu_indexes[0]); + if (terms->num_radix_blocks == radix_lwe_out->num_radix_blocks) { + copy_radix_ciphertext_async(streams[0], gpu_indexes[0], + radix_lwe_out, terms); return; } if (old_blocks != terms) { - cuda_memcpy_async_gpu_to_gpu(old_blocks, terms, - num_blocks_in_radix * num_radix_in_vec * - big_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + copy_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks, + terms); } if (num_radix_in_vec == 2) { - legacy_host_addition( - streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); + CudaRadixCiphertextFFI old_blocks_slice; + as_radix_ciphertext_slice(&old_blocks_slice, old_blocks, + num_radix_blocks, 2 * num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks_slice, num_radix_blocks); return; } @@ -239,10 +242,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( size_t message_max = message_modulus - 1; size_t chunk_size = (total_modulus - 1) / message_max; - size_t h_lwe_idx_in[r * num_blocks]; - size_t h_lwe_idx_out[r * num_blocks]; - int32_t h_smart_copy_in[r * num_blocks]; - int32_t h_smart_copy_out[r * num_blocks]; + size_t h_lwe_idx_in[terms->num_radix_blocks]; + size_t h_lwe_idx_out[terms->num_radix_blocks]; + int32_t h_smart_copy_in[terms->num_radix_blocks]; + int32_t h_smart_copy_out[terms->num_radix_blocks]; /// Here it is important to query the default max shared memory on device 0 /// instead of cuda_get_max_shared_memory, @@ -261,11 +264,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( if (reused_lut == nullptr) { luts_message_carry = new int_radix_lut( streams, gpu_indexes, gpu_count, mem_ptr->params, 2, - 2 * ch_amount * num_blocks, true); + 2 * ch_amount * num_radix_blocks, true); } else { luts_message_carry = new int_radix_lut( streams, gpu_indexes, gpu_count, mem_ptr->params, 2, - 2 * ch_amount * num_blocks, reused_lut); + 2 * ch_amount * num_radix_blocks, reused_lut); } auto message_acc = luts_message_carry->get_lut(0, 0); auto carry_acc = luts_message_carry->get_lut(0, 1); @@ -291,15 
+294,16 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( luts_message_carry->broadcast_lut(streams, gpu_indexes, 0); while (r > 2) { - size_t cur_total_blocks = r * num_blocks; + size_t cur_total_blocks = r * num_radix_blocks; size_t ch_amount = r / chunk_size; if (!ch_amount) ch_amount++; - dim3 add_grid(ch_amount, num_blocks, 1); + dim3 add_grid(ch_amount, num_radix_blocks, 1); cuda_set_device(gpu_indexes[0]); tree_add_chunks<<>>( - new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks); + (Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr, min(r, chunk_size), + big_lwe_size, num_radix_blocks); check_cuda_error(cudaGetLastError()); @@ -309,9 +313,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( size_t sm_copy_count = 0; generate_ids_update_degrees( - terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in, - h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max, - total_count, message_count, carry_count, sm_copy_count); + terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in, + h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size, + message_max, total_count, message_count, carry_count, sm_copy_count); auto lwe_indexes_in = luts_message_carry->lwe_indexes_in; auto lwe_indexes_out = luts_message_carry->lwe_indexes_out; luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0], @@ -326,12 +330,12 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( // inside d_smart_copy_in there are only -1 values // it's fine to call smart_copy with same pointer // as source and destination - cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks, - r * num_blocks * big_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + copy_radix_ciphertext_slice_async( + streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks, + new_blocks, 0, r * num_radix_blocks); smart_copy<<>>( - new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in, - big_lwe_size); + (Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr, + d_smart_copy_out, d_smart_copy_in, big_lwe_size); check_cuda_error(cudaGetLastError()); if (carry_count > 0) @@ -359,27 +363,27 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( /// because the keyswitch and PBS do not operate on the same number of /// inputs execute_keyswitch_async( - streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks, - lwe_indexes_in, ksks, polynomial_size * glwe_dimension, - small_lwe_dimension, mem_ptr->params.ks_base_log, - mem_ptr->params.ks_level, message_count); + streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr, + lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks, + polynomial_size * glwe_dimension, small_lwe_dimension, + mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count); /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension execute_pbs_async( - streams, gpu_indexes, 1, new_blocks, lwe_indexes_out, + streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out, luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec, - small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, - glwe_dimension, small_lwe_dimension, polynomial_size, - mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, - mem_ptr->params.pbs_type, num_many_lut, lut_stride); + (Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks, + luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, + 
polynomial_size, mem_ptr->params.pbs_base_log, + mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, + total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); multi_gpu_scatter_lwe_async( - streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks, - luts_message_carry->h_lwe_indexes_in, + streams, gpu_indexes, active_gpu_count, new_blocks_vec, + (Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in, luts_message_carry->using_trivial_lwe_indexes, message_count, big_lwe_size); @@ -396,7 +400,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a /// different configuration multi_gpu_gather_lwe_async( - streams, gpu_indexes, gpu_count, small_lwe_vector, + streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr, small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in, luts_message_carry->using_trivial_lwe_indexes, message_count, small_lwe_size); @@ -407,7 +411,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( multi_gpu_scatter_lwe_async( streams, gpu_indexes, gpu_count, small_lwe_vector_vec, - small_lwe_vector, luts_message_carry->h_lwe_indexes_in, + (Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in, luts_message_carry->using_trivial_lwe_indexes, total_count, small_lwe_size); @@ -424,8 +428,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( mem_ptr->params.pbs_type, num_many_lut, lut_stride); multi_gpu_gather_lwe_async( - streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec, - luts_message_carry->h_lwe_indexes_out, + streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr, + lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out, luts_message_carry->using_trivial_lwe_indexes, total_count, big_lwe_size); /// Synchronize all GPUs @@ -433,41 +437,59 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( cuda_synchronize_stream(streams[i], gpu_indexes[i]); } } + Torus lut_indexes[luts_message_carry->num_blocks]; + cuda_memcpy_async_to_cpu(&lut_indexes, + luts_message_carry->get_lut_indexes(0, 0), + luts_message_carry->num_blocks * sizeof(Torus), + streams[0], gpu_indexes[0]); + for (uint i = 0; i < total_count; i++) { + new_blocks->degrees[i] = luts_message_carry->degrees[lut_indexes[i]]; + new_blocks->noise_levels[i] = NoiseLevel::NOMINAL; + } - int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0; - int new_blocks_created = 2 * ch_amount * num_blocks; - copy_size = rem_blocks * big_lwe_size * sizeof(Torus); + int rem_blocks = (r > chunk_size) ? 
r % chunk_size * num_radix_blocks : 0; + int new_blocks_created = 2 * ch_amount * num_radix_blocks; - auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size]; - auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size]; - cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, streams[0], - gpu_indexes[0]); + if (rem_blocks > 0) + copy_radix_ciphertext_slice_async( + streams[0], gpu_indexes[0], new_blocks, new_blocks_created, + new_blocks_created + rem_blocks, old_blocks, + cur_total_blocks - rem_blocks, cur_total_blocks); std::swap(new_blocks, old_blocks); - r = (new_blocks_created + rem_blocks) / num_blocks; + r = (new_blocks_created + rem_blocks) / num_radix_blocks; } luts_message_carry->release(streams, gpu_indexes, gpu_count); delete (luts_message_carry); - legacy_host_addition( - streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); + CudaRadixCiphertextFFI old_blocks_slice; + as_radix_ciphertext_slice(&old_blocks_slice, old_blocks, + num_radix_blocks, 2 * num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks_slice, num_radix_blocks); } template __host__ void host_integer_mult_radix_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left, - bool const is_bool_left, uint64_t const *radix_lwe_right, - bool const is_bool_right, void *const *bsks, uint64_t *const *ksks, - int_mul_memory *mem_ptr, uint32_t num_blocks) { - - auto glwe_dimension = mem_ptr->params.glwe_dimension; - auto polynomial_size = mem_ptr->params.polynomial_size; + uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left, + CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right, + void *const *bsks, uint64_t *const *ksks, int_mul_memory *mem_ptr, + uint32_t num_blocks) { + + if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension || + radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension) + PANIC("Cuda error: input and output lwe dimensions should be the same") + if (radix_lwe_out->num_radix_blocks < num_blocks || + radix_lwe_left->num_radix_blocks < num_blocks || + radix_lwe_right->num_radix_blocks < num_blocks) + PANIC("Cuda error: input or output does not have enough radix blocks") auto lwe_dimension = mem_ptr->params.small_lwe_dimension; auto message_modulus = mem_ptr->params.message_modulus; auto carry_modulus = mem_ptr->params.carry_modulus; - int big_lwe_dimension = glwe_dimension * polynomial_size; + int big_lwe_dimension = radix_lwe_left->lwe_dimension; + int big_lwe_size = big_lwe_dimension + 1; if (is_bool_right) { zero_out_if(streams, gpu_indexes, gpu_count, radix_lwe_out, @@ -529,57 +551,55 @@ __host__ void host_integer_mult_radix_kb( // 2 * (glwe_dimension + 1) * polynomial_size auto luts_array = mem_ptr->luts_array; - auto vector_result_lsb = &vector_result_sb[0]; - auto vector_result_msb = - &vector_result_sb[lsb_vector_block_count * - (polynomial_size * glwe_dimension + 1)]; + auto vector_result_lsb = vector_result_sb; + CudaRadixCiphertextFFI vector_result_msb; + as_radix_ciphertext_slice(&vector_result_msb, vector_result_lsb, + lsb_vector_block_count, + vector_result_lsb->num_radix_blocks); - auto vector_lsb_rhs = &block_mul_res[0]; - auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count * - (polynomial_size * glwe_dimension + 1)]; + auto vector_lsb_rhs = 
block_mul_res; + CudaRadixCiphertextFFI vector_msb_rhs; + as_radix_ciphertext_slice(&vector_msb_rhs, block_mul_res, + lsb_vector_block_count, + block_mul_res->num_radix_blocks); dim3 grid(lsb_vector_block_count, 1, 1); dim3 thds(params::degree / params::opt, 1, 1); cuda_set_device(gpu_indexes[0]); all_shifted_lhs_rhs<<>>( - radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right, - vector_lsb_rhs, vector_msb_rhs, num_blocks); + (Torus *)radix_lwe_left->ptr, (Torus *)vector_result_lsb->ptr, + (Torus *)vector_result_msb.ptr, (Torus *)radix_lwe_right->ptr, + (Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks); check_cuda_error(cudaGetLastError()); - legacy_integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res, - vector_result_sb, bsks, ksks, total_block_count, luts_array, + vector_result_sb, bsks, ksks, luts_array, total_block_count, luts_array->params.message_modulus); - vector_result_lsb = &block_mul_res[0]; - vector_result_msb = &block_mul_res[lsb_vector_block_count * - (polynomial_size * glwe_dimension + 1)]; + vector_result_lsb = block_mul_res; + as_radix_ciphertext_slice(&vector_result_msb, block_mul_res, + lsb_vector_block_count, + block_mul_res->num_radix_blocks); cuda_set_device(gpu_indexes[0]); fill_radix_from_lsb_msb <<>>(vector_result_sb, vector_result_lsb, vector_result_msb, - glwe_dimension, num_blocks); + streams[0]>>>( + (Torus *)vector_result_sb->ptr, (Torus *)vector_result_lsb->ptr, + (Torus *)vector_result_msb.ptr, big_lwe_size, num_blocks); check_cuda_error(cudaGetLastError()); - int terms_degree[2 * num_blocks * num_blocks]; for (int i = 0; i < num_blocks * num_blocks; i++) { size_t r_id = i / num_blocks; size_t b_id = i % num_blocks; - terms_degree[i] = (b_id >= r_id) ? message_modulus - 1 : 0; + vector_result_sb->degrees[i] = (b_id >= r_id) ? message_modulus - 1 : 0; } - auto terms_degree_msb = &terms_degree[num_blocks * num_blocks]; - for (int i = 0; i < num_blocks * num_blocks; i++) { - size_t r_id = i / num_blocks; - size_t b_id = i % num_blocks; - terms_degree_msb[i] = (b_id > r_id) ? 
message_modulus - 2 : 0; - } - host_integer_partial_sum_ciphertexts_vec_kb( - streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, - terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, - 2 * num_blocks, mem_ptr->luts_array); + streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks, + ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks, + mem_ptr->luts_array); uint32_t block_modulus = message_modulus * carry_modulus; uint32_t num_bits_in_block = log2_int(block_modulus); @@ -587,9 +607,9 @@ __host__ void host_integer_mult_radix_kb( auto scp_mem_ptr = mem_ptr->sc_prop_mem; uint32_t requested_flag = outputFlag::FLAG_NONE; uint32_t uses_carry = 0; - legacy_host_propagate_single_carry( + host_propagate_single_carry( streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr, - scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry); + scp_mem_ptr, bsks, ksks, requested_flag, uses_carry); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh index fc553578ed..5f00a67b5d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh @@ -139,8 +139,8 @@ void set_zero_radix_ciphertext_slice_async(cudaStream_t const stream, if (radix->num_radix_blocks < end_lwe_index - start_lwe_index) PANIC("Cuda error: input radix should have more blocks than the specified " "range") - if (start_lwe_index >= end_lwe_index) - PANIC("Cuda error: slice range should be non negative") + if (start_lwe_index > end_lwe_index) + PANIC("Cuda error: slice range should be positive") auto lwe_size = radix->lwe_dimension + 1; auto num_blocks_to_set = end_lwe_index - start_lwe_index; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu index 7cbfd566d9..04ce8f164c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu @@ -21,65 +21,53 @@ void scratch_cuda_integer_scalar_mul_kb_64( void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, uint64_t const *decomposed_scalar, + CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks, - void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) { + void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t num_scalars) { switch (polynomial_size) { case 512: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 1024: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - 
(uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 2048: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 4096: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 8192: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 16384: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; default: PANIC("Cuda error (scalar multiplication): unsupported polynomial size. 
" diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index f801e8bb29..d18f158377 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -42,81 +42,86 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb( template __host__ void host_integer_scalar_mul_radix( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, T *lwe_array, T const *decomposed_scalar, - T const *has_at_least_one_set, int_scalar_mul_buffer *mem, - void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension, - uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) { + uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, + T const *decomposed_scalar, T const *has_at_least_one_set, + int_scalar_mul_buffer *mem, void *const *bsks, T *const *ksks, + uint32_t message_modulus, uint32_t num_scalars) { + auto num_radix_blocks = lwe_array->num_radix_blocks; // lwe_size includes the presence of the body // whereas lwe_dimension is the number of elements in the mask - uint32_t lwe_size = input_lwe_dimension + 1; - uint32_t lwe_size_bytes = lwe_size * sizeof(T); uint32_t msg_bits = log2_int(message_modulus); uint32_t num_ciphertext_bits = msg_bits * num_radix_blocks; - T *preshifted_buffer = mem->preshifted_buffer; - T *all_shifted_buffer = (T *)mem->all_shifted_buffer->ptr; + auto preshifted_buffer = mem->preshifted_buffer; + auto all_shifted_buffer = mem->all_shifted_buffer; for (size_t shift_amount = 0; shift_amount < msg_bits; shift_amount++) { - T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks; + CudaRadixCiphertextFFI shift_input; + as_radix_ciphertext_slice(&shift_input, preshifted_buffer, + shift_amount * num_radix_blocks, + preshifted_buffer->num_radix_blocks); if (has_at_least_one_set[shift_amount] == 1) { - cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array, - lwe_size_bytes * num_radix_blocks, - streams[0], gpu_indexes[0]); - legacy_host_integer_radix_logical_scalar_shift_kb_inplace( - streams, gpu_indexes, gpu_count, ptr, shift_amount, + copy_radix_ciphertext_slice_async(streams[0], gpu_indexes[0], + &shift_input, 0, num_radix_blocks, + lwe_array, 0, num_radix_blocks); + host_integer_radix_logical_scalar_shift_kb_inplace( + streams, gpu_indexes, gpu_count, &shift_input, shift_amount, mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks); } else { // create trivial assign for value = 0 - cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0], - gpu_indexes[0]); + set_zero_radix_ciphertext_slice_async( + streams[0], gpu_indexes[0], &shift_input, 0, num_radix_blocks); } } size_t j = 0; for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) { if (decomposed_scalar[i] == 1) { // Perform a block shift - T *preshifted_radix_ct = - preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size; - T *block_shift_buffer = - all_shifted_buffer + j * num_radix_blocks * lwe_size; - legacy_host_radix_blocks_rotate_right( - streams, gpu_indexes, gpu_count, block_shift_buffer, - preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size); + CudaRadixCiphertextFFI preshifted_radix_ct; + as_radix_ciphertext_slice(&preshifted_radix_ct, preshifted_buffer, + (i % msg_bits) * num_radix_blocks, + preshifted_buffer->num_radix_blocks); + CudaRadixCiphertextFFI block_shift_buffer; + as_radix_ciphertext_slice(&block_shift_buffer, all_shifted_buffer, + j * 
+                                         num_radix_blocks,
+                                         all_shifted_buffer->num_radix_blocks);
+      host_radix_blocks_rotate_right(
+          streams, gpu_indexes, gpu_count, &block_shift_buffer,
+          &preshifted_radix_ct, i / msg_bits, num_radix_blocks);
       // create trivial assign for value = 0
-      cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
-                        streams[0], gpu_indexes[0]);
+      set_zero_radix_ciphertext_slice_async(
+          streams[0], gpu_indexes[0], &block_shift_buffer, 0, i / msg_bits);
       j++;
     }
   }
 
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
   if (mem->anticipated_buffers_drop) {
-    cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], preshifted_buffer);
+    delete preshifted_buffer;
     mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
     delete (mem->logical_scalar_shift_buffer);
   }
 
   if (j == 0) {
     // lwe array = 0
-    cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
+    set_zero_radix_ciphertext_slice_async(streams[0], gpu_indexes[0],
+                                          lwe_array, 0, num_radix_blocks);
   } else {
-    int terms_degree[j * num_radix_blocks];
     for (int i = 0; i < j * num_radix_blocks; i++) {
-      terms_degree[i] = message_modulus - 1;
+      all_shifted_buffer->degrees[i] = message_modulus - 1;
     }
     host_integer_partial_sum_ciphertexts_vec_kb(
-        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
-        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
-        num_radix_blocks, j, nullptr);
+        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
+        ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr);
 
     auto scp_mem_ptr = mem->sc_prop_mem;
     uint32_t requested_flag = outputFlag::FLAG_NONE;
     uint32_t uses_carry = 0;
-    legacy_host_propagate_single_carry(
-        streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
-        scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry);
+    host_propagate_single_carry(streams, gpu_indexes, gpu_count, lwe_array,
+                                nullptr, nullptr, scp_mem_ptr, bsks, ksks,
+                                requested_flag, uses_carry);
   }
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
index 6ca372bae3..ac58cab4d8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -32,7 +32,7 @@ void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
   host_integer_radix_logical_scalar_shift_kb_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
       (int_logical_scalar_shift_buffer *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), lwe_array->num_radix_blocks);
 }
 
 void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index 6676a6c55d..974442b571 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -119,9 +119,10 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
     int_logical_scalar_shift_buffer *mem, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks, uint32_t num_blocks) {
 
-  auto num_blocks = lwe_array->num_radix_blocks;
+  if (lwe_array->num_radix_blocks < num_blocks)
+    PANIC("Cuda error: input does not have enough blocks")
   auto params = mem->params;
   auto message_modulus = params.message_modulus;
@@ -147,9 +148,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
                                       num_blocks);
 
     // create trivial assign for value = 0
-    if (rotations > 0)
-      set_zero_radix_ciphertext_slice_async(
-          streams[0], gpu_indexes[0], &rotated_buffer, 0, rotations);
+    set_zero_radix_ciphertext_slice_async(streams[0], gpu_indexes[0],
+                                          &rotated_buffer, 0, rotations);
     copy_radix_ciphertext_slice_async(streams[0], gpu_indexes[0], lwe_array, 0,
                                       num_blocks, &rotated_buffer, 0,
                                       num_blocks);
@@ -183,11 +183,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
     // rotate left as the blocks are from LSB to MSB
 
     // create trivial assign for value = 0
-    if (rotations > 0) {
-      set_zero_radix_ciphertext_slice_async(
-          streams[0], gpu_indexes[0], &rotated_buffer, num_blocks - rotations,
-          num_blocks);
-    }
+    set_zero_radix_ciphertext_slice_async(
+        streams[0], gpu_indexes[0], &rotated_buffer, num_blocks - rotations,
+        num_blocks);
     copy_radix_ciphertext_slice_async(streams[0], gpu_indexes[0], lwe_array, 0,
                                       num_blocks, &rotated_buffer, 0,
                                       num_blocks);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
index 52f7ef44eb..d7e8f48fde 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -110,9 +110,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                                       rotated_input, input_bits_b, rotations,
                                       total_nb_bits);
 
-    if (rotations > 0)
-      set_zero_radix_ciphertext_slice_async(
-          streams[0], gpu_indexes[0], rotated_input, 0, rotations);
+    set_zero_radix_ciphertext_slice_async(streams[0], gpu_indexes[0],
+                                          rotated_input, 0, rotations);
     break;
   case RIGHT_SHIFT:
     // rotate left as the blocks are from LSB to MSB
@@ -130,10 +129,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                                           &last_bit, 0, 1);
     } else {
-      if (rotations > 0)
-        set_zero_radix_ciphertext_slice_async(
-            streams[0], gpu_indexes[0], rotated_input,
-            total_nb_bits - rotations, total_nb_bits);
+      set_zero_radix_ciphertext_slice_async(
+          streams[0], gpu_indexes[0], rotated_input,
+          total_nb_bits - rotations, total_nb_bits);
     }
     break;
   case LEFT_ROTATE:
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 8cdc924a41..8615b97721 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -362,10 +362,10 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_out: *mut ffi::c_void,
-        radix_lwe_left: *const ffi::c_void,
+        radix_lwe_out: *mut CudaRadixCiphertextFFI,
+        radix_lwe_left: *const CudaRadixCiphertextFFI,
         is_bool_left: bool,
-        radix_lwe_right: *const ffi::c_void,
+        radix_lwe_right: *const CudaRadixCiphertextFFI,
         is_bool_right: bool,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
@@ -909,13 +909,11 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_out: *mut ffi::c_void,
-        radix_lwe_vec: *mut ffi::c_void,
-        num_radix_in_vec: u32,
+        radix_lwe_out: *mut CudaRadixCiphertextFFI,
+        radix_lwe_vec: *mut CudaRadixCiphertextFFI,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        num_blocks_in_radix: u32,
     );
 }
 unsafe extern "C" {
@@ -952,16 +950,14 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
        decomposed_scalar: *const u64,
         has_at_least_one_set: *const u64,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        lwe_dimension: u32,
         polynomial_size: u32,
         message_modulus: u32,
-        num_blocks: u32,
         num_scalars: u32,
     );
 }
diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs
index cc36e6f7ce..e3090fd8c5 100644
--- a/tfhe/src/integer/gpu/ciphertext/info.rs
+++ b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -1,5 +1,3 @@
-use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
-use crate::integer::server_key::TwosComplementNegation;
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::{CarryModulus, MessageModulus, PBSOrder};
 
@@ -24,84 +22,6 @@ pub struct CudaRadixCiphertextInfo {
 }
 
 impl CudaRadixCiphertextInfo {
-    // Creates an iterator that return decomposed blocks of the negated
-    // value of `scalar`
-    //
-    // Returns
-    // - `None` if scalar is zero
-    // - `Some` if scalar is non-zero
-    //
-    fn create_negated_block_decomposer<T>(&self, scalar: T) -> Option<impl Iterator<Item = u8>>
-    where
-        T: TwosComplementNegation + DecomposableInto<u8>,
-    {
-        if scalar == T::ZERO {
-            return None;
-        }
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        assert!(bits_in_message <= u8::BITS);
-
-        // The whole idea behind this iterator we construct is:
-        // - to support combos of parameters and num blocks for which the total number of bits is
-        //   not a multiple of T::BITS
-        //
-        // - Support subtraction in the case the T::BITS is lower than the target ciphertext bits.
-        //   In clear rust this would require an upcast, to support that we have to do a few things
-
-        let neg_scalar = scalar.twos_complement_negation();
-
-        // If we had upcasted the scalar, its msb would be zeros (0)
-        // then they would become ones (1) after the bitwise_not (!).
-        // The only case where these msb could become 0 after the addition
-        // is if scalar == T::ZERO (=> !T::ZERO == T::MAX => T::MAX + 1 == overflow),
-        // but this case has been handled earlier.
-        let padding_bit = 1u32; // To handle when bits is not a multiple of T::BITS
-        // All bits of message set to one
-        let pad_block = (1 << bits_in_message as u8) - 1;
-
-        let decomposer = BlockDecomposer::with_padding_bit(
-            neg_scalar,
-            bits_in_message,
-            T::cast_from(padding_bit),
-        )
-        .iter_as::<u8>()
-        .chain(std::iter::repeat(pad_block));
-        Some(decomposer)
-    }
-
-    pub(crate) fn after_mul(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|left| CudaBlockInfo {
-                    degree: Degree::new(left.message_modulus.0 - 1),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
-    pub(crate) fn after_ilog2(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|info| CudaBlockInfo {
-                    degree: Degree::new(info.message_modulus.0 - 1),
-                    message_modulus: info.message_modulus,
-                    carry_modulus: info.carry_modulus,
-                    pbs_order: info.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_div_rem(&self) -> Self {
         Self {
             blocks: self
@@ -149,47 +69,6 @@ impl CudaRadixCiphertextInfo {
         }
     }
 
-    pub(crate) fn after_scalar_mul(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|info| CudaBlockInfo {
-                    degree: Degree::new(info.message_modulus.0 - 1),
-                    message_modulus: info.message_modulus,
-                    carry_modulus: info.carry_modulus,
-                    pbs_order: info.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
-    pub(crate) fn after_scalar_sub<T>(&self, scalar: T) -> Self
-    where
-        T: TwosComplementNegation + DecomposableInto<u8>,
-    {
-        let Some(decomposer) = self.create_negated_block_decomposer(scalar) else {
-            // subtraction by zero
-            return self.clone();
-        };
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(decomposer)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: Degree::new(left.degree.get() + u64::from(scalar_block)),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_bitnot(&self) -> Self {
         Self {
             blocks: self
diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs
index f304704e1e..453955c8e5 100644
--- a/tfhe/src/integer/gpu/ciphertext/mod.rs
+++ b/tfhe/src/integer/gpu/ciphertext/mod.rs
@@ -171,6 +171,23 @@ impl CudaRadixCiphertext {
             })
             .collect()
     }
+    pub fn from_radix_ciphertext_vec<T: CudaIntegerRadixCiphertext>(
+        list: &[T],
+        streams: &CudaStreams,
+    ) -> Self {
+        let lwes = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
+            list.iter().map(|ciphertext| &ciphertext.as_ref().d_blocks),
+            streams,
+        );
+        let info = CudaRadixCiphertextInfo {
+            blocks: list
+                .iter()
+                .flat_map(|ciphertext| ciphertext.as_ref().info.blocks.iter())
+                .copied()
+                .collect::<Vec<_>>(),
+        };
+        Self::new(lwes, info)
+    }
 }
 
 impl CudaUnsignedRadixCiphertext {
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index bfda9e34c6..4b9a12f5d5 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -279,7 +279,7 @@ pub unsafe fn scalar_addition_integer_radix_assign_async(
 /// is required
 pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(
     streams: &CudaStreams,
-    lwe_array: &mut CudaVec<T>,
+    lwe_array: &mut CudaRadixCiphertext,
     decomposed_scalar: &[T],
     has_at_least_one_set: &[T],
     bootstrapping_key: &CudaVec<B>,
@@ -293,14 +293,13 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(
         decomposed_scalar.as_ptr().cast::<u64>(),
         has_at_least_one_set.as_ptr().cast::<u64>(),
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
-        (glwe_dimension.0 * polynomial_size.0) as u32,
         polynomial_size.0 as u32,
         message_modulus.0 as u32,
-        num_blocks,
         num_scalars,
     );
@@ -357,6 +366,7 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(
     streams: &CudaStreams,
-    radix_lwe_left: &mut CudaVec<T>,
+    radix_lwe_left: &mut CudaRadixCiphertext,
     is_boolean_left: bool,
-    radix_lwe_right: &CudaVec<T>,
+    radix_lwe_right: &CudaRadixCiphertext,
     is_boolean_right: bool,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
@@ -613,12 +623,12 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async(
     streams: &CudaStreams,
-    result: &mut CudaVec<T>,
-    radix_list: &mut CudaVec<T>,
+    result: &mut CudaRadixCiphertext,
+    radix_list: &mut CudaRadixCiphertext,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
     message_modulus: MessageModulus,
@@ -2747,12 +2798,12 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        result.gpu_index(0),
+        result.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_list.gpu_index(0),
+        radix_list.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -2766,6 +2817,22 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
         "GPU error: all data should reside on the same GPU."
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut result_degrees = result.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut result_noise_levels = result.info.blocks.iter().map(|b| b.noise_level.0).collect();
+    let mut cuda_ffi_result =
+        prepare_cuda_radix_ffi(result, &mut result_degrees, &mut result_noise_levels);
+    let mut radix_list_degrees = radix_list.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut radix_list_noise_levels = radix_list
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_radix_list = prepare_cuda_radix_ffi(
+        radix_list,
+        &mut radix_list_degrees,
+        &mut radix_list_noise_levels,
+    );
     scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
@@ -2790,13 +2857,11 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        result.as_mut_c_ptr(0),
-        radix_list.as_mut_c_ptr(0),
-        num_radixes,
+        &mut cuda_ffi_result,
+        &mut cuda_ffi_radix_list,
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
-        num_blocks,
     );
     cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
         streams.ptr.as_ptr(),
@@ -2804,6 +2869,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
         streams.len() as u32,
         std::ptr::addr_of_mut!(mem_ptr),
     );
+    update_noise_degree(result, &cuda_ffi_result);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs
index 2ba76c13ee..86336eead1 100644
--- a/tfhe/src/integer/gpu/server_key/radix/add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/add.rs
@@ -1,9 +1,9 @@
-use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::LweBskGroupingFactor;
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{
-    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
+    CudaUnsignedRadixCiphertext,
 };
 use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
 use crate::integer::gpu::{
@@ -242,6 +242,7 @@ impl CudaServerKey {
             streams,
             0,
         );
+        result.as_mut().info = ciphertexts[0].as_ref().info.clone();
         if ciphertexts.len() == 1 {
             return;
         }
@@ -262,19 +263,14 @@ impl CudaServerKey {
 
         let radix_count_in_vec = ciphertexts.len();
 
-        let mut terms = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
-            ciphertexts
-                .iter()
-                .map(|ciphertext| &ciphertext.as_ref().d_blocks),
-            streams,
-        );
+        let mut terms = CudaRadixCiphertext::from_radix_ciphertext_vec(ciphertexts, streams);
 
         match &self.bootstrapping_key {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
                     streams,
-                    &mut result.as_mut().d_blocks.0.d_vec,
-                    &mut terms.0.d_vec,
+                    result.as_mut(),
+                    &mut terms,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.message_modulus,
@@ -297,8 +293,8 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
                     streams,
-                    &mut result.as_mut().d_blocks.0.d_vec,
-                    &mut terms.0.d_vec,
+                    result.as_mut(),
+                    &mut terms,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.message_modulus,
diff --git a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
index 2acf8ce9fe..9aa6571c20 100644
--- a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
@@ -162,6 +162,22 @@ impl CudaServerKey {
                 .as_mut_slice((i * lwe_size)..((i + 1) * lwe_size), 0)
                 .unwrap();
             dest_slice.copy_from_gpu_async(&src_slice, streams, 0);
+            for b in new_item.ciphertext.info.blocks.iter_mut() {
+                b.degree = leading_count_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .degree;
+                b.noise_level = leading_count_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .noise_level;
+            }
             cts.push(new_item);
         }
 
@@ -404,6 +420,22 @@ impl CudaServerKey {
                 .as_mut_slice((i * lwe_size)..((i + 1) * lwe_size), 0)
                 .unwrap();
             dest_slice.copy_from_gpu_async(&src_slice, streams, 0);
+            for b in new_item.ciphertext.info.blocks.iter_mut() {
+                b.degree = leading_zeros_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .degree;
+                b.noise_level = leading_zeros_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .noise_level;
+            }
             cts.push(new_item);
         }
 
@@ -490,9 +522,7 @@ impl CudaServerKey {
 
         let result = self.sum_ciphertexts_async(ciphertexts, streams).unwrap();
 
-        let mut result_cast = self.cast_to_unsigned_async(result, counter_num_blocks, streams);
-        result_cast.as_mut().info = ct.as_ref().info.after_ilog2();
-        result_cast
+        self.cast_to_unsigned_async(result, counter_num_blocks, streams)
     }
 
     /// Returns the number of trailing zeros in the binary representation of `ct`
diff --git a/tfhe/src/integer/gpu/server_key/radix/mul.rs b/tfhe/src/integer/gpu/server_key/radix/mul.rs
index 5f199ed990..3541647003 100644
--- a/tfhe/src/integer/gpu/server_key/radix/mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mul.rs
@@ -81,9 +81,9 @@ impl CudaServerKey {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_mul_integer_radix_kb_assign_async(
                     streams,
-                    &mut ct_left.as_mut().d_blocks.0.d_vec,
+                    ct_left.as_mut(),
                     is_boolean_left,
-                    &ct_right.as_ref().d_blocks.0.d_vec,
+                    ct_right.as_ref(),
                     is_boolean_right,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
@@ -104,9 +104,9 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_mul_integer_radix_kb_assign_async(
                     streams,
-                    &mut ct_left.as_mut().d_blocks.0.d_vec,
+                    ct_left.as_mut(),
                     is_boolean_left,
-                    &ct_right.as_ref().d_blocks.0.d_vec,
+                    ct_right.as_ref(),
                     is_boolean_right,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
@@ -125,8 +125,6 @@ impl CudaServerKey {
                 );
             }
         }
-
-        ct_left.as_mut().info = ct_left.as_ref().info.after_mul();
     }
 
     pub fn unchecked_mul_assign(
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
index 0e2ba4633f..c1515eb688 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
@@ -116,7 +116,7 @@ impl CudaServerKey {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_scalar_mul_integer_radix_kb_async(
                     streams,
-                    &mut ct.as_mut().d_blocks.0.d_vec,
+                    ct.as_mut(),
                     decomposed_scalar.as_slice(),
                     has_at_least_one_set.as_slice(),
                     &d_bsk.d_vec,
@@ -132,7 +132,6 @@ impl CudaServerKey {
                     d_bsk.decomp_level_count,
                     self.key_switching_key.decomposition_base_log(),
                     self.key_switching_key.decomposition_level_count(),
-                    num_blocks as u32,
                     decomposed_scalar.len() as u32,
                     PBSType::Classical,
                     LweBskGroupingFactor(0),
@@ -141,7 +140,7 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_scalar_mul_integer_radix_kb_async(
                     streams,
-                    &mut ct.as_mut().d_blocks.0.d_vec,
+                    ct.as_mut(),
                     decomposed_scalar.as_slice(),
                     has_at_least_one_set.as_slice(),
                     &d_multibit_bsk.d_vec,
@@ -157,15 +156,12 @@ impl CudaServerKey {
                     d_multibit_bsk.decomp_level_count,
                     self.key_switching_key.decomposition_base_log(),
                     self.key_switching_key.decomposition_level_count(),
-                    num_blocks as u32,
                     decomposed_scalar.len() as u32,
                     PBSType::MultiBit,
                     d_multibit_bsk.grouping_factor,
                 );
             }
         }
-
-        ct.as_mut().info = ct.as_ref().info.after_scalar_mul();
     }
 
     pub fn unchecked_scalar_mul_assign(
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
index 035ac19484..a45d67e072 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
@@ -78,7 +78,6 @@ impl CudaServerKey {
     {
         let negated_scalar = scalar.twos_complement_negation();
         self.unchecked_scalar_add_assign_async(ct, negated_scalar, streams);
-        ct.as_mut().info = ct.as_ref().info.after_scalar_sub(scalar);
     }
 
     pub fn unchecked_scalar_sub_assign(
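
Reviewer note (not part of the patch): every call site converted above follows the same round trip — flatten each block's degree and noise level into plain buffers, hand the backend a `CudaRadixCiphertextFFI`-style view over them, and afterwards copy the values the backend wrote back into the Rust-side block infos (the `prepare_cuda_radix_ffi` / `update_noise_degree` pairs). The sketch below is a minimal, self-contained illustration of that pattern only; `BlockInfo`, `RadixCiphertext`, `RadixFfiView`, `prepare_view` and `write_back` are stand-in names invented for the example, not tfhe-rs APIs.

```rust
// Illustrative sketch only: toy stand-ins for the radix ciphertext and its
// FFI view; all names and fields here are assumptions, not the crate's types.
#[derive(Clone, Copy, Debug)]
struct BlockInfo {
    degree: u64,
    noise_level: u64,
}

struct RadixCiphertext {
    // Device data omitted; only the per-block metadata matters for the pattern.
    blocks: Vec<BlockInfo>,
}

/// Flat, C-friendly view over caller-owned metadata buffers, in the spirit of
/// the CudaRadixCiphertextFFI struct passed across the backend boundary.
#[allow(dead_code)]
#[repr(C)]
struct RadixFfiView {
    degrees: *mut u64,
    noise_levels: *mut u64,
    num_radix_blocks: u32,
}

/// Mirrors the `prepare_cuda_radix_ffi(ct, &mut degrees, &mut noise_levels)`
/// call pattern: fill the scratch buffers and expose raw pointers to them.
fn prepare_view(
    ct: &RadixCiphertext,
    degrees: &mut Vec<u64>,
    noise_levels: &mut Vec<u64>,
) -> RadixFfiView {
    *degrees = ct.blocks.iter().map(|b| b.degree).collect();
    *noise_levels = ct.blocks.iter().map(|b| b.noise_level).collect();
    RadixFfiView {
        degrees: degrees.as_mut_ptr(),
        noise_levels: noise_levels.as_mut_ptr(),
        num_radix_blocks: ct.blocks.len() as u32,
    }
}

/// Mirrors `update_noise_degree`: copy whatever the backend wrote into the
/// scratch buffers back into the Rust-side block infos.
fn write_back(ct: &mut RadixCiphertext, degrees: &[u64], noise_levels: &[u64]) {
    for ((block, degree), noise) in ct.blocks.iter_mut().zip(degrees).zip(noise_levels) {
        block.degree = *degree;
        block.noise_level = *noise;
    }
}

fn main() {
    let mut ct = RadixCiphertext {
        blocks: vec![BlockInfo { degree: 3, noise_level: 1 }; 4],
    };
    let (mut degrees, mut noise_levels) = (Vec::new(), Vec::new());
    let view = prepare_view(&ct, &mut degrees, &mut noise_levels);
    assert_eq!(view.num_radix_blocks, 4);
    // A real call would pass the view across the FFI boundary and the backend
    // could update the buffers; simulate such an update here.
    degrees.iter_mut().for_each(|d| *d = 1);
    write_back(&mut ct, &degrees, &noise_levels);
    assert!(ct.blocks.iter().all(|b| b.degree == 1 && b.noise_level == 1));
}
```

The buffers stay owned by the Rust caller for the duration of the call, which is why the converted call sites above allocate the `*_degrees` / `*_noise_levels` vectors before building the FFI struct and only read them back once the backend call has completed.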