zama-ai · agnesLeroy · Sep 12, 2024 · Sep 10, 2024
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -112,10 +112,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count, int8_t **mem_ptr_void);
 
-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus);
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t *>(lwe_output_indexes),
@@ -66,7 +66,7 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
     uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
     uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
 
-  host_packing_keyswitch_lwe_list_to_glwe(
+  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(glwe_array_out),
       static_cast<uint64_t *>(lwe_array_in),

diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -177,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
     int num_blocks = (n + block_size - 1) / block_size;
 
     // Launch the kernel
-    cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
-                                                                 n);
+    cuda_set_value_kernel<Torus>
+        <<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
     check_cuda_error(cudaGetLastError());
   }
 }

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -37,12 +37,12 @@ void host_resolve_signed_overflow(
       streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
       mem->params.big_lwe_dimension, 1);
 
-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, x, mem->params.big_lwe_dimension,
-                1);
-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, last_block_input_carry,
-                mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, x,
+                       mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, last_block_input_carry,
+                       mem->params.big_lwe_dimension, 1);
 
   host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
                                       last_block_inner_propagation,
@@ -94,25 +94,25 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
 
   // phase 1
   if (op == SIGNED_OPERATION::ADDITION) {
-    host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
+                         big_lwe_dimension, num_blocks);
   } else {
-    host_integer_radix_negation(
+    host_integer_radix_negation<Torus>(
         streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
         num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
-    host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
+                         big_lwe_dimension, num_blocks);
   }
 
   // phase 2
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
 
-  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
-                              result, output_carry, input_carries,
-                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
-  host_generate_last_block_inner_propagation(
+  host_propagate_single_carry<Torus>(
+      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
+      input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation<Torus>(
       mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
       last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
       &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
   // phase 3
   auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
 
-  host_resolve_signed_overflow(
+  host_resolve_signed_overflow<Torus>(
       streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
       input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
 

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
   std::function<uint64_t(uint64_t)> predicate_lut_f =
       [](uint64_t x) -> uint64_t { return x == 1; };
 
-  scratch_cuda_integer_radix_cmux_kb(
+  scratch_cuda_integer_radix_cmux_kb<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
       lwe_ciphertext_count, params, allocate_gpu_memory);

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
     auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
     auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
 
-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
-        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
-        params.message_modulus, 1);
+    device_pack_bivariate_blocks<Torus>
+        <<<num_blocks, num_threads, 0, streams[0]>>>(
+            lwe_array_out_block, predicate->lwe_indexes_in,
+            lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
+            params.big_lwe_dimension, params.message_modulus, 1);
     check_cuda_error(cudaGetLastError());
   }
 
@@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb(
   }
 
   auto mem_true = mem_ptr->zero_if_true_buffer;
-  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-              lwe_array_true, lwe_condition, mem_true,
-              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                     lwe_array_true, lwe_condition, mem_true,
+                     mem_ptr->inverted_predicate_lut, bsks, ksks,
+                     num_radix_blocks);
   auto mem_false = mem_ptr->zero_if_false_buffer;
-  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
-              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
-              bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
+                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
+                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
+                     num_radix_blocks);
   for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
   }
@@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
   // will be 0 If the condition was false, true_ct will be 0 and false_ct will
   // have kept its value
   auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
-                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
-                num_radix_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
+                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
+                       params.big_lwe_dimension, num_radix_blocks);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,