diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index c1d04edf40..3bb815acce 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -132,10 +132,11 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( void cuda_integer_mult_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left, - void const *radix_lwe_right, bool const is_bool_right, void *const *bsks, - void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size, - uint32_t num_blocks); + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left, + CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right, + void *const *bsks, void *const *ksks, int8_t *mem_ptr, + uint32_t polynomial_size, uint32_t num_blocks); void cleanup_cuda_integer_mult(void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -375,9 +376,9 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks_in_radix); + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks, + void *const *ksks); void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -393,10 +394,10 @@ void scratch_cuda_integer_scalar_mul_kb_64( void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, uint64_t const *decomposed_scalar, + CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars); + void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t num_scalars); void cleanup_cuda_integer_radix_scalar_mul(void *const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index e7745d5008..fe4224dc73 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -21,7 +21,7 @@ template __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src, uint32_t value, uint32_t blocks_count, uint32_t lwe_size); -void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, +void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in, size_t *h_lwe_idx_out, int32_t *h_smart_copy_in, int32_t *h_smart_copy_out, size_t ch_amount, @@ -1161,10 +1161,10 @@ template struct int_overflowing_sub_memory { }; template struct int_sum_ciphertexts_vec_memory { - Torus *new_blocks; - Torus *new_blocks_copy; - Torus *old_blocks; - Torus *small_lwe_vector; + CudaRadixCiphertextFFI *new_blocks; + CudaRadixCiphertextFFI *new_blocks_copy; + 
CudaRadixCiphertextFFI *old_blocks; + CudaRadixCiphertextFFI *small_lwe_vector; int_radix_params params; int32_t *d_smart_copy_in; @@ -1183,34 +1183,22 @@ template struct int_sum_ciphertexts_vec_memory { int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; // allocate gpu memory for intermediate buffers - new_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - old_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - small_lwe_vector = (Torus *)cuda_malloc_async( - max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(old_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(small_lwe_vector, 0, - max_pbs_count * (params.small_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + new_blocks = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + new_blocks, max_pbs_count, + params.big_lwe_dimension); + new_blocks_copy = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + new_blocks_copy, max_pbs_count, + params.big_lwe_dimension); + old_blocks = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + old_blocks, max_pbs_count, + params.big_lwe_dimension); + small_lwe_vector = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + small_lwe_vector, max_pbs_count, + params.small_lwe_dimension); d_smart_copy_in = (int32_t *)cuda_malloc_async( max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); @@ -1227,8 +1215,9 @@ template struct int_sum_ciphertexts_vec_memory { uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, - Torus *new_blocks, Torus *old_blocks, - Torus *small_lwe_vector) { + CudaRadixCiphertextFFI *new_blocks, + CudaRadixCiphertextFFI *old_blocks, + CudaRadixCiphertextFFI *small_lwe_vector) { mem_reuse = true; this->params = params; @@ -1238,13 +1227,10 @@ template struct int_sum_ciphertexts_vec_memory { this->new_blocks = new_blocks; this->old_blocks = old_blocks; this->small_lwe_vector = small_lwe_vector; - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + new_blocks_copy = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], + new_blocks_copy, max_pbs_count, + params.big_lwe_dimension); d_smart_copy_in = (int32_t *)cuda_malloc_async( max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); @@ -1262,12 +1248,15 @@ template struct int_sum_ciphertexts_vec_memory { cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); if (!mem_reuse) { - 
cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], new_blocks); + delete new_blocks; + release_radix_ciphertext(streams[0], gpu_indexes[0], old_blocks); + delete old_blocks; + release_radix_ciphertext(streams[0], gpu_indexes[0], small_lwe_vector); + delete small_lwe_vector; } - - cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], new_blocks_copy); + delete new_blocks_copy; } }; // For sequential algorithm in group propagation @@ -2482,7 +2471,7 @@ template struct int_zero_out_if_buffer { int_radix_params params; - Torus *tmp; + CudaRadixCiphertextFFI *tmp; cudaStream_t *true_streams; cudaStream_t *false_streams; @@ -2495,10 +2484,11 @@ template struct int_zero_out_if_buffer { this->params = params; active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); if (allocate_gpu_memory) { - tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); + tmp = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp, + num_radix_blocks, + params.big_lwe_dimension); // We may use a different stream to allow concurrent operation true_streams = (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); @@ -2512,7 +2502,8 @@ template struct int_zero_out_if_buffer { } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - cuda_drop_async(tmp, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], tmp); + delete tmp; for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(true_streams[j], gpu_indexes[j]); cuda_destroy_stream(false_streams[j], gpu_indexes[j]); @@ -2523,9 +2514,9 @@ template struct int_zero_out_if_buffer { }; template struct int_mul_memory { - Torus *vector_result_sb; - Torus *block_mul_res; - Torus *small_lwe_vector; + CudaRadixCiphertextFFI *vector_result_sb; + CudaRadixCiphertextFFI *block_mul_res; + CudaRadixCiphertextFFI *small_lwe_vector; int_radix_lut *luts_array; // lsb msb int_radix_lut *zero_out_predicate_lut; @@ -2574,7 +2565,6 @@ template struct int_mul_memory { auto polynomial_size = params.polynomial_size; auto message_modulus = params.message_modulus; auto carry_modulus = params.carry_modulus; - auto lwe_dimension = params.small_lwe_dimension; // 'vector_result_lsb' contains blocks from all possible shifts of // radix_lwe_left excluding zero ciphertext blocks @@ -2587,17 +2577,18 @@ template struct int_mul_memory { int total_block_count = lsb_vector_block_count + msb_vector_block_count; // allocate memory for intermediate buffers - vector_result_sb = (Torus *)cuda_malloc_async( - 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - block_mul_res = (Torus *)cuda_malloc_async( - 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - small_lwe_vector = (Torus *)cuda_malloc_async( - total_block_count * (lwe_dimension + 1) * sizeof(Torus), streams[0], - gpu_indexes[0]); + vector_result_sb = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], vector_result_sb, 2 * total_block_count, + params.big_lwe_dimension); + block_mul_res = new 
CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], block_mul_res, 2 * total_block_count, + params.big_lwe_dimension); + small_lwe_vector = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], small_lwe_vector, total_block_count, + params.small_lwe_dimension); // create int_radix_lut objects for lsb, msb, message, carry // luts_array -> lut = {lsb_acc, msb_acc} @@ -2658,9 +2649,12 @@ template struct int_mul_memory { return; } - cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]); - cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]); - cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], vector_result_sb); + delete vector_result_sb; + release_radix_ciphertext(streams[0], gpu_indexes[0], block_mul_res); + delete block_mul_res; + release_radix_ciphertext(streams[0], gpu_indexes[0], small_lwe_vector); + delete small_lwe_vector; luts_array->release(streams, gpu_indexes, gpu_count); sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); @@ -4435,7 +4429,7 @@ template struct int_scalar_mul_buffer { int_radix_params params; int_logical_scalar_shift_buffer *logical_scalar_shift_buffer; int_sum_ciphertexts_vec_memory *sum_ciphertexts_vec_mem; - Torus *preshifted_buffer; + CudaRadixCiphertextFFI *preshifted_buffer; CudaRadixCiphertextFFI *all_shifted_buffer; int_sc_prop_memory *sc_prop_mem; bool anticipated_buffers_drop; @@ -4450,25 +4444,21 @@ template struct int_scalar_mul_buffer { if (allocate_gpu_memory) { uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus); - uint32_t lwe_size = params.big_lwe_dimension + 1; - uint32_t lwe_size_bytes = lwe_size * sizeof(Torus); size_t num_ciphertext_bits = msg_bits * num_radix_blocks; //// Contains all shifted values of lhs for shift in range (0..msg_bits) //// The idea is that with these we can create all other shift that are /// in / range (0..total_bits) for free (block rotation) - preshifted_buffer = (Torus *)cuda_malloc_async( - num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]); + preshifted_buffer = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams[0], gpu_indexes[0], preshifted_buffer, num_ciphertext_bits, + params.big_lwe_dimension); all_shifted_buffer = new CudaRadixCiphertextFFI; create_zero_radix_ciphertext_async( streams[0], gpu_indexes[0], all_shifted_buffer, num_ciphertext_bits * num_radix_blocks, params.big_lwe_dimension); - cuda_memset_async(preshifted_buffer, 0, - num_ciphertext_bits * lwe_size_bytes, streams[0], - gpu_indexes[0]); - if (num_ciphertext_bits * num_radix_blocks >= num_radix_blocks + 2) logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer( @@ -4500,7 +4490,8 @@ template struct int_scalar_mul_buffer { release_radix_ciphertext(streams[0], gpu_indexes[0], all_shifted_buffer); delete all_shifted_buffer; if (!anticipated_buffers_drop) { - cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]); + release_radix_ciphertext(streams[0], gpu_indexes[0], preshifted_buffer); + delete preshifted_buffer; logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count); delete (logical_scalar_shift_buffer); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index e5ca6f0ebf..b8ac78e306 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -7,11 +7,20 @@ 
template __host__ void zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus const *lwe_array_input, - Torus const *lwe_condition, + CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_input, + CudaRadixCiphertextFFI const *lwe_condition, int_zero_out_if_buffer *mem_ptr, int_radix_lut *predicate, void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) { + if (lwe_array_out->num_radix_blocks < num_radix_blocks || + lwe_array_input->num_radix_blocks < num_radix_blocks) + PANIC("Cuda error: input or output radix ciphertexts does not have enough " + "blocks") + if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension || + lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension) + PANIC("Cuda error: input and output radix ciphertexts must have the same " + "lwe dimension") cuda_set_device(gpu_indexes[0]); auto params = mem_ptr->params; @@ -21,12 +30,11 @@ __host__ void zero_out_if(cudaStream_t const *streams, host_pack_bivariate_blocks_with_single_block( streams, gpu_indexes, gpu_count, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input, lwe_condition, - predicate->lwe_indexes_in, params.big_lwe_dimension, - params.message_modulus, num_radix_blocks); + predicate->lwe_indexes_in, params.message_modulus, num_radix_blocks); - legacy_integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks, - ksks, num_radix_blocks, predicate); + ksks, predicate, num_radix_blocks); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index e03a691c6b..c2896b611c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -557,10 +557,20 @@ __global__ void device_pack_bivariate_blocks_with_single_block( template __host__ void host_pack_bivariate_blocks_with_single_block( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_indexes_out, - Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in, - uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) { + uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out, + Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1, + CudaRadixCiphertextFFI const *lwe_2, Torus const *lwe_indexes_in, + uint32_t shift, uint32_t num_radix_blocks) { + if (lwe_array_out->num_radix_blocks < num_radix_blocks || + lwe_array_1->num_radix_blocks < num_radix_blocks) + PANIC("Cuda error: input or output radix ciphertexts does not have enough " + "blocks") + if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension || + lwe_array_1->lwe_dimension != lwe_2->lwe_dimension) + PANIC("Cuda error: input and output radix ciphertexts must have the same " + "lwe dimension") + auto lwe_dimension = lwe_array_out->lwe_dimension; cuda_set_device(gpu_indexes[0]); // Left message is shifted int num_blocks = 0, num_threads = 0; @@ -568,7 +578,8 @@ __host__ void host_pack_bivariate_blocks_with_single_block( getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); device_pack_bivariate_blocks_with_single_block <<>>( - lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_2, lwe_indexes_in, + (Torus *)lwe_array_out->ptr, lwe_indexes_out, + (Torus *)lwe_array_1->ptr, (Torus *)lwe_2->ptr, 
lwe_indexes_in, lwe_dimension, shift, num_radix_blocks); check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu index 0733a2ad34..56e28b8211 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu @@ -13,7 +13,7 @@ * as output ids, -1 value as an input id means that zero ciphertext will be * copied on output index. */ -void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, +void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in, size_t *h_lwe_idx_out, int32_t *h_smart_copy_in, int32_t *h_smart_copy_out, size_t ch_amount, @@ -127,66 +127,53 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( */ void cuda_integer_mult_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left, - void const *radix_lwe_right, bool const is_bool_right, void *const *bsks, - void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size, - uint32_t num_blocks) { + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left, + CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right, + void *const *bsks, void *const *ksks, int8_t *mem_ptr, + uint32_t polynomial_size, uint32_t num_blocks) { switch (polynomial_size) { case 256: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 512: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 1024: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 2048: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 4096: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + 
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 8192: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 16384: host_integer_mult_radix_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_left), is_bool_left, - static_cast(radix_lwe_right), is_bool_right, bsks, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; default: @@ -226,79 +213,70 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks_in_radix) { + CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks, + void *const *ksks) { auto mem = (int_sum_ciphertexts_vec_memory *)mem_ptr; - int *terms_degree = - (int *)malloc(num_blocks_in_radix * num_radix_in_vec * sizeof(int)); - - for (int i = 0; i < num_radix_in_vec * num_blocks_in_radix; i++) { - terms_degree[i] = mem->params.message_modulus - 1; - } - switch (mem->params.polynomial_size) { case 512: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 1024: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 2048: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 4096: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, 
num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 8192: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; case 16384: host_integer_partial_sum_ciphertexts_vec_kb>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsks, - (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_vec, bsks, (uint64_t **)(ksks), mem, + radix_lwe_out->num_radix_blocks, + radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks, nullptr); break; default: PANIC("Cuda error (integer multiplication): unsupported polynomial size. " "Supported N's are powers of two in the interval [256..16384].") } - - free(terms_degree); } void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 3f32aa77b2..5d443fb296 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -126,9 +126,8 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks, template __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, Torus *msb_blocks, - uint32_t glwe_dimension, + uint32_t big_lwe_size, uint32_t num_blocks) { - size_t big_lwe_dimension = glwe_dimension * params::degree + 1; size_t big_lwe_id = blockIdx.x; size_t radix_id = big_lwe_id / num_blocks; size_t block_id = big_lwe_id % num_blocks; @@ -138,10 +137,9 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, bool process_lsb = (radix_id <= block_id); bool process_msb = (radix_id + 1 <= block_id); - auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_dimension]; - auto cur_res_msb_ct = - &result_blocks[num_blocks * num_blocks * big_lwe_dimension + - big_lwe_id * big_lwe_dimension]; + auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_size]; + auto cur_res_msb_ct = &result_blocks[num_blocks * num_blocks * big_lwe_size + + big_lwe_id * big_lwe_size]; Torus *cur_lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) * radix_id / 2 * (params::degree + 1)]; Torus *cur_msb_radix = (process_msb) @@ -184,12 +182,20 @@ __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb( template __host__ void host_integer_partial_sum_ciphertexts_vec_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *radix_lwe_out, Torus *terms, int *terms_degree, - void *const *bsks, uint64_t *const *ksks, + uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks, int_sum_ciphertexts_vec_memory *mem_ptr, - uint32_t num_blocks_in_radix, uint32_t 
num_radix_in_vec, + uint32_t num_radix_blocks, uint32_t num_radix_in_vec, int_radix_lut *reused_lut) { + if (terms->lwe_dimension != radix_lwe_out->lwe_dimension) + PANIC("Cuda error: output and input radix ciphertexts should have the same " + "lwe dimension") + if (num_radix_in_vec > + terms->num_radix_blocks / radix_lwe_out->num_radix_blocks) + PANIC("Cuda error: input vector does not have enough blocks") + if (num_radix_blocks > radix_lwe_out->num_radix_blocks) + PANIC("Cuda error: output does not have enough blocks") auto new_blocks = mem_ptr->new_blocks; auto new_blocks_copy = mem_ptr->new_blocks_copy; auto old_blocks = mem_ptr->old_blocks; @@ -200,7 +206,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( auto message_modulus = mem_ptr->params.message_modulus; auto carry_modulus = mem_ptr->params.carry_modulus; - auto num_blocks = num_blocks_in_radix; auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension; auto big_lwe_size = big_lwe_dimension + 1; auto glwe_dimension = mem_ptr->params.glwe_dimension; @@ -212,25 +217,23 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( uint32_t num_many_lut = 1; uint32_t lut_stride = 0; - if (num_radix_in_vec == 0) + if (terms->num_radix_blocks == 0) return; - if (num_radix_in_vec == 1) { - cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms, - num_blocks_in_radix * big_lwe_size * - sizeof(Torus), - streams[0], gpu_indexes[0]); + if (terms->num_radix_blocks == radix_lwe_out->num_radix_blocks) { + copy_radix_ciphertext_async(streams[0], gpu_indexes[0], + radix_lwe_out, terms); return; } if (old_blocks != terms) { - cuda_memcpy_async_gpu_to_gpu(old_blocks, terms, - num_blocks_in_radix * num_radix_in_vec * - big_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + copy_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks, + terms); } if (num_radix_in_vec == 2) { - legacy_host_addition( - streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); + CudaRadixCiphertextFFI old_blocks_slice; + as_radix_ciphertext_slice(&old_blocks_slice, old_blocks, + num_radix_blocks, 2 * num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks_slice, num_radix_blocks); return; } @@ -239,10 +242,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( size_t message_max = message_modulus - 1; size_t chunk_size = (total_modulus - 1) / message_max; - size_t h_lwe_idx_in[r * num_blocks]; - size_t h_lwe_idx_out[r * num_blocks]; - int32_t h_smart_copy_in[r * num_blocks]; - int32_t h_smart_copy_out[r * num_blocks]; + size_t h_lwe_idx_in[terms->num_radix_blocks]; + size_t h_lwe_idx_out[terms->num_radix_blocks]; + int32_t h_smart_copy_in[terms->num_radix_blocks]; + int32_t h_smart_copy_out[terms->num_radix_blocks]; /// Here it is important to query the default max shared memory on device 0 /// instead of cuda_get_max_shared_memory, @@ -261,11 +264,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( if (reused_lut == nullptr) { luts_message_carry = new int_radix_lut( streams, gpu_indexes, gpu_count, mem_ptr->params, 2, - 2 * ch_amount * num_blocks, true); + 2 * ch_amount * num_radix_blocks, true); } else { luts_message_carry = new int_radix_lut( streams, gpu_indexes, gpu_count, mem_ptr->params, 2, - 2 * ch_amount * num_blocks, reused_lut); + 2 * ch_amount * num_radix_blocks, reused_lut); } auto message_acc = luts_message_carry->get_lut(0, 0); auto carry_acc = luts_message_carry->get_lut(0, 1); @@ -291,15 
+294,16 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( luts_message_carry->broadcast_lut(streams, gpu_indexes, 0); while (r > 2) { - size_t cur_total_blocks = r * num_blocks; + size_t cur_total_blocks = r * num_radix_blocks; size_t ch_amount = r / chunk_size; if (!ch_amount) ch_amount++; - dim3 add_grid(ch_amount, num_blocks, 1); + dim3 add_grid(ch_amount, num_radix_blocks, 1); cuda_set_device(gpu_indexes[0]); tree_add_chunks<<>>( - new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks); + (Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr, min(r, chunk_size), + big_lwe_size, num_radix_blocks); check_cuda_error(cudaGetLastError()); @@ -309,9 +313,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( size_t sm_copy_count = 0; generate_ids_update_degrees( - terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in, - h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max, - total_count, message_count, carry_count, sm_copy_count); + terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in, + h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size, + message_max, total_count, message_count, carry_count, sm_copy_count); auto lwe_indexes_in = luts_message_carry->lwe_indexes_in; auto lwe_indexes_out = luts_message_carry->lwe_indexes_out; luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0], @@ -326,12 +330,12 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( // inside d_smart_copy_in there are only -1 values // it's fine to call smart_copy with same pointer // as source and destination - cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks, - r * num_blocks * big_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + copy_radix_ciphertext_slice_async( + streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks, + new_blocks, 0, r * num_radix_blocks); smart_copy<<>>( - new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in, - big_lwe_size); + (Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr, + d_smart_copy_out, d_smart_copy_in, big_lwe_size); check_cuda_error(cudaGetLastError()); if (carry_count > 0) @@ -359,27 +363,27 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( /// because the keyswitch and PBS do not operate on the same number of /// inputs execute_keyswitch_async( - streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks, - lwe_indexes_in, ksks, polynomial_size * glwe_dimension, - small_lwe_dimension, mem_ptr->params.ks_base_log, - mem_ptr->params.ks_level, message_count); + streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr, + lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks, + polynomial_size * glwe_dimension, small_lwe_dimension, + mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count); /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension execute_pbs_async( - streams, gpu_indexes, 1, new_blocks, lwe_indexes_out, + streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out, luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec, - small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, - glwe_dimension, small_lwe_dimension, polynomial_size, - mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, - mem_ptr->params.pbs_type, num_many_lut, lut_stride); + (Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks, + luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, + 
polynomial_size, mem_ptr->params.pbs_base_log, + mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, + total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); multi_gpu_scatter_lwe_async( - streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks, - luts_message_carry->h_lwe_indexes_in, + streams, gpu_indexes, active_gpu_count, new_blocks_vec, + (Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in, luts_message_carry->using_trivial_lwe_indexes, message_count, big_lwe_size); @@ -396,7 +400,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a /// different configuration multi_gpu_gather_lwe_async( - streams, gpu_indexes, gpu_count, small_lwe_vector, + streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr, small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in, luts_message_carry->using_trivial_lwe_indexes, message_count, small_lwe_size); @@ -407,7 +411,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( multi_gpu_scatter_lwe_async( streams, gpu_indexes, gpu_count, small_lwe_vector_vec, - small_lwe_vector, luts_message_carry->h_lwe_indexes_in, + (Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in, luts_message_carry->using_trivial_lwe_indexes, total_count, small_lwe_size); @@ -424,8 +428,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( mem_ptr->params.pbs_type, num_many_lut, lut_stride); multi_gpu_gather_lwe_async( - streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec, - luts_message_carry->h_lwe_indexes_out, + streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr, + lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out, luts_message_carry->using_trivial_lwe_indexes, total_count, big_lwe_size); /// Synchronize all GPUs @@ -433,41 +437,59 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( cuda_synchronize_stream(streams[i], gpu_indexes[i]); } } + Torus lut_indexes[luts_message_carry->num_blocks]; + cuda_memcpy_async_to_cpu(&lut_indexes, + luts_message_carry->get_lut_indexes(0, 0), + luts_message_carry->num_blocks * sizeof(Torus), + streams[0], gpu_indexes[0]); + for (uint i = 0; i < total_count; i++) { + new_blocks->degrees[i] = luts_message_carry->degrees[lut_indexes[i]]; + new_blocks->noise_levels[i] = NoiseLevel::NOMINAL; + } - int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0; - int new_blocks_created = 2 * ch_amount * num_blocks; - copy_size = rem_blocks * big_lwe_size * sizeof(Torus); + int rem_blocks = (r > chunk_size) ? 
r % chunk_size * num_radix_blocks : 0; + int new_blocks_created = 2 * ch_amount * num_radix_blocks; - auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size]; - auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size]; - cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, streams[0], - gpu_indexes[0]); + if (rem_blocks > 0) + copy_radix_ciphertext_slice_async( + streams[0], gpu_indexes[0], new_blocks, new_blocks_created, + new_blocks_created + rem_blocks, old_blocks, + cur_total_blocks - rem_blocks, cur_total_blocks); std::swap(new_blocks, old_blocks); - r = (new_blocks_created + rem_blocks) / num_blocks; + r = (new_blocks_created + rem_blocks) / num_radix_blocks; } luts_message_carry->release(streams, gpu_indexes, gpu_count); delete (luts_message_carry); - legacy_host_addition( - streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); + CudaRadixCiphertextFFI old_blocks_slice; + as_radix_ciphertext_slice(&old_blocks_slice, old_blocks, + num_radix_blocks, 2 * num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks_slice, num_radix_blocks); } template __host__ void host_integer_mult_radix_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left, - bool const is_bool_left, uint64_t const *radix_lwe_right, - bool const is_bool_right, void *const *bsks, uint64_t *const *ksks, - int_mul_memory *mem_ptr, uint32_t num_blocks) { - - auto glwe_dimension = mem_ptr->params.glwe_dimension; - auto polynomial_size = mem_ptr->params.polynomial_size; + uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out, + CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left, + CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right, + void *const *bsks, uint64_t *const *ksks, int_mul_memory *mem_ptr, + uint32_t num_blocks) { + + if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension || + radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension) + PANIC("Cuda error: input and output lwe dimensions should be the same") + if (radix_lwe_out->num_radix_blocks < num_blocks || + radix_lwe_left->num_radix_blocks < num_blocks || + radix_lwe_right->num_radix_blocks < num_blocks) + PANIC("Cuda error: input or output does not have enough radix blocks") auto lwe_dimension = mem_ptr->params.small_lwe_dimension; auto message_modulus = mem_ptr->params.message_modulus; auto carry_modulus = mem_ptr->params.carry_modulus; - int big_lwe_dimension = glwe_dimension * polynomial_size; + int big_lwe_dimension = radix_lwe_left->lwe_dimension; + int big_lwe_size = big_lwe_dimension + 1; if (is_bool_right) { zero_out_if(streams, gpu_indexes, gpu_count, radix_lwe_out, @@ -529,57 +551,55 @@ __host__ void host_integer_mult_radix_kb( // 2 * (glwe_dimension + 1) * polynomial_size auto luts_array = mem_ptr->luts_array; - auto vector_result_lsb = &vector_result_sb[0]; - auto vector_result_msb = - &vector_result_sb[lsb_vector_block_count * - (polynomial_size * glwe_dimension + 1)]; + auto vector_result_lsb = vector_result_sb; + CudaRadixCiphertextFFI vector_result_msb; + as_radix_ciphertext_slice(&vector_result_msb, vector_result_lsb, + lsb_vector_block_count, + vector_result_lsb->num_radix_blocks); - auto vector_lsb_rhs = &block_mul_res[0]; - auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count * - (polynomial_size * glwe_dimension + 1)]; + auto vector_lsb_rhs = 
block_mul_res; + CudaRadixCiphertextFFI vector_msb_rhs; + as_radix_ciphertext_slice(&vector_msb_rhs, block_mul_res, + lsb_vector_block_count, + block_mul_res->num_radix_blocks); dim3 grid(lsb_vector_block_count, 1, 1); dim3 thds(params::degree / params::opt, 1, 1); cuda_set_device(gpu_indexes[0]); all_shifted_lhs_rhs<<>>( - radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right, - vector_lsb_rhs, vector_msb_rhs, num_blocks); + (Torus *)radix_lwe_left->ptr, (Torus *)vector_result_lsb->ptr, + (Torus *)vector_result_msb.ptr, (Torus *)radix_lwe_right->ptr, + (Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks); check_cuda_error(cudaGetLastError()); - legacy_integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res, - vector_result_sb, bsks, ksks, total_block_count, luts_array, + vector_result_sb, bsks, ksks, luts_array, total_block_count, luts_array->params.message_modulus); - vector_result_lsb = &block_mul_res[0]; - vector_result_msb = &block_mul_res[lsb_vector_block_count * - (polynomial_size * glwe_dimension + 1)]; + vector_result_lsb = block_mul_res; + as_radix_ciphertext_slice(&vector_result_msb, block_mul_res, + lsb_vector_block_count, + block_mul_res->num_radix_blocks); cuda_set_device(gpu_indexes[0]); fill_radix_from_lsb_msb <<>>(vector_result_sb, vector_result_lsb, vector_result_msb, - glwe_dimension, num_blocks); + streams[0]>>>( + (Torus *)vector_result_sb->ptr, (Torus *)vector_result_lsb->ptr, + (Torus *)vector_result_msb.ptr, big_lwe_size, num_blocks); check_cuda_error(cudaGetLastError()); - int terms_degree[2 * num_blocks * num_blocks]; for (int i = 0; i < num_blocks * num_blocks; i++) { size_t r_id = i / num_blocks; size_t b_id = i % num_blocks; - terms_degree[i] = (b_id >= r_id) ? message_modulus - 1 : 0; + vector_result_sb->degrees[i] = (b_id >= r_id) ? message_modulus - 1 : 0; } - auto terms_degree_msb = &terms_degree[num_blocks * num_blocks]; - for (int i = 0; i < num_blocks * num_blocks; i++) { - size_t r_id = i / num_blocks; - size_t b_id = i % num_blocks; - terms_degree_msb[i] = (b_id > r_id) ? 
message_modulus - 2 : 0; - } - host_integer_partial_sum_ciphertexts_vec_kb( - streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, - terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, - 2 * num_blocks, mem_ptr->luts_array); + streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks, + ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks, + mem_ptr->luts_array); uint32_t block_modulus = message_modulus * carry_modulus; uint32_t num_bits_in_block = log2_int(block_modulus); @@ -587,9 +607,9 @@ __host__ void host_integer_mult_radix_kb( auto scp_mem_ptr = mem_ptr->sc_prop_mem; uint32_t requested_flag = outputFlag::FLAG_NONE; uint32_t uses_carry = 0; - legacy_host_propagate_single_carry( + host_propagate_single_carry( streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr, - scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry); + scp_mem_ptr, bsks, ksks, requested_flag, uses_carry); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh index fc553578ed..5f00a67b5d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh @@ -139,8 +139,8 @@ void set_zero_radix_ciphertext_slice_async(cudaStream_t const stream, if (radix->num_radix_blocks < end_lwe_index - start_lwe_index) PANIC("Cuda error: input radix should have more blocks than the specified " "range") - if (start_lwe_index >= end_lwe_index) - PANIC("Cuda error: slice range should be non negative") + if (start_lwe_index > end_lwe_index) + PANIC("Cuda error: slice range should be positive") auto lwe_size = radix->lwe_dimension + 1; auto num_blocks_to_set = end_lwe_index - start_lwe_index; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu index 7cbfd566d9..04ce8f164c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu @@ -21,65 +21,53 @@ void scratch_cuda_integer_scalar_mul_kb_64( void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, uint64_t const *decomposed_scalar, + CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks, - void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) { + void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t num_scalars) { switch (polynomial_size) { case 512: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 1024: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - 
(uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 2048: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 4096: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 8192: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; case 16384: host_integer_scalar_mul_radix>( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), decomposed_scalar, - has_at_least_one_set, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, + decomposed_scalar, has_at_least_one_set, reinterpret_cast *>(mem), bsks, - (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, - num_scalars); + (uint64_t **)(ksks), message_modulus, num_scalars); break; default: PANIC("Cuda error (scalar multiplication): unsupported polynomial size. 
" diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index f801e8bb29..d18f158377 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -42,81 +42,86 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb( template __host__ void host_integer_scalar_mul_radix( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, T *lwe_array, T const *decomposed_scalar, - T const *has_at_least_one_set, int_scalar_mul_buffer *mem, - void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension, - uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) { + uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, + T const *decomposed_scalar, T const *has_at_least_one_set, + int_scalar_mul_buffer *mem, void *const *bsks, T *const *ksks, + uint32_t message_modulus, uint32_t num_scalars) { + auto num_radix_blocks = lwe_array->num_radix_blocks; // lwe_size includes the presence of the body // whereas lwe_dimension is the number of elements in the mask - uint32_t lwe_size = input_lwe_dimension + 1; - uint32_t lwe_size_bytes = lwe_size * sizeof(T); uint32_t msg_bits = log2_int(message_modulus); uint32_t num_ciphertext_bits = msg_bits * num_radix_blocks; - T *preshifted_buffer = mem->preshifted_buffer; - T *all_shifted_buffer = (T *)mem->all_shifted_buffer->ptr; + auto preshifted_buffer = mem->preshifted_buffer; + auto all_shifted_buffer = mem->all_shifted_buffer; for (size_t shift_amount = 0; shift_amount < msg_bits; shift_amount++) { - T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks; + CudaRadixCiphertextFFI shift_input; + as_radix_ciphertext_slice(&shift_input, preshifted_buffer, + shift_amount * num_radix_blocks, + preshifted_buffer->num_radix_blocks); if (has_at_least_one_set[shift_amount] == 1) { - cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array, - lwe_size_bytes * num_radix_blocks, - streams[0], gpu_indexes[0]); - legacy_host_integer_radix_logical_scalar_shift_kb_inplace( - streams, gpu_indexes, gpu_count, ptr, shift_amount, + copy_radix_ciphertext_slice_async(streams[0], gpu_indexes[0], + &shift_input, 0, num_radix_blocks, + lwe_array, 0, num_radix_blocks); + host_integer_radix_logical_scalar_shift_kb_inplace( + streams, gpu_indexes, gpu_count, &shift_input, shift_amount, mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks); } else { // create trivial assign for value = 0 - cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0], - gpu_indexes[0]); + set_zero_radix_ciphertext_slice_async( + streams[0], gpu_indexes[0], &shift_input, 0, num_radix_blocks); } } size_t j = 0; for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) { if (decomposed_scalar[i] == 1) { // Perform a block shift - T *preshifted_radix_ct = - preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size; - T *block_shift_buffer = - all_shifted_buffer + j * num_radix_blocks * lwe_size; - legacy_host_radix_blocks_rotate_right( - streams, gpu_indexes, gpu_count, block_shift_buffer, - preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size); + CudaRadixCiphertextFFI preshifted_radix_ct; + as_radix_ciphertext_slice(&preshifted_radix_ct, preshifted_buffer, + (i % msg_bits) * num_radix_blocks, + preshifted_buffer->num_radix_blocks); + CudaRadixCiphertextFFI block_shift_buffer; + as_radix_ciphertext_slice(&block_shift_buffer, all_shifted_buffer, + j * 
+                                         num_radix_blocks,
+                                         all_shifted_buffer->num_radix_blocks);
+      host_radix_blocks_rotate_right(
+          streams, gpu_indexes, gpu_count, &block_shift_buffer,
+          &preshifted_radix_ct, i / msg_bits, num_radix_blocks);
       // create trivial assign for value = 0
-      cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
-                        streams[0], gpu_indexes[0]);
+      set_zero_radix_ciphertext_slice_async(
+          streams[0], gpu_indexes[0], &block_shift_buffer, 0, i / msg_bits);
       j++;
     }
   }
 
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
   if (mem->anticipated_buffers_drop) {
-    cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], preshifted_buffer);
+    delete preshifted_buffer;
     mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
     delete (mem->logical_scalar_shift_buffer);
   }
 
   if (j == 0) {
     // lwe array = 0
-    cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
+    set_zero_radix_ciphertext_slice_async(streams[0], gpu_indexes[0],
+                                          lwe_array, 0, num_radix_blocks);
   } else {
-    int terms_degree[j * num_radix_blocks];
     for (int i = 0; i < j * num_radix_blocks; i++) {
-      terms_degree[i] = message_modulus - 1;
+      all_shifted_buffer->degrees[i] = message_modulus - 1;
     }
     host_integer_partial_sum_ciphertexts_vec_kb(
-        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
-        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
-        num_radix_blocks, j, nullptr);
+        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
+        ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr);
 
     auto scp_mem_ptr = mem->sc_prop_mem;
     uint32_t requested_flag = outputFlag::FLAG_NONE;
     uint32_t uses_carry = 0;
-    legacy_host_propagate_single_carry(
-        streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
-        scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry);
+    host_propagate_single_carry(streams, gpu_indexes, gpu_count, lwe_array,
+                                nullptr, nullptr, scp_mem_ptr, bsks, ksks,
+                                requested_flag, uses_carry);
   }
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
index 6ca372bae3..ac58cab4d8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -32,7 +32,7 @@ void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
   host_integer_radix_logical_scalar_shift_kb_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
       (int_logical_scalar_shift_buffer *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), lwe_array->num_radix_blocks);
 }
 
 void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index 6676a6c55d..974442b571 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -119,9 +119,10 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
     int_logical_scalar_shift_buffer *mem, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks, uint32_t num_blocks) {
 
-  auto num_blocks = lwe_array->num_radix_blocks;
+  if (lwe_array->num_radix_blocks < num_blocks)
+    PANIC("Cuda error: input does not have enough blocks")
   auto params = mem->params;
   auto message_modulus = params.message_modulus;
@@ -147,9 +148,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
                                       num_blocks);
 
     // create trivial assign for value = 0
-    if (rotations > 0)
-      set_zero_radix_ciphertext_slice_async(
-          streams[0], gpu_indexes[0], &rotated_buffer, 0, rotations);
+    set_zero_radix_ciphertext_slice_async(streams[0], gpu_indexes[0],
+                                          &rotated_buffer, 0, rotations);
     copy_radix_ciphertext_slice_async(streams[0], gpu_indexes[0], lwe_array, 0,
                                       num_blocks, &rotated_buffer, 0,
                                       num_blocks);
@@ -183,11 +183,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
     // rotate left as the blocks are from LSB to MSB
 
     // create trivial assign for value = 0
-    if (rotations > 0) {
-      set_zero_radix_ciphertext_slice_async(
-          streams[0], gpu_indexes[0], &rotated_buffer, num_blocks - rotations,
-          num_blocks);
-    }
+    set_zero_radix_ciphertext_slice_async(
+        streams[0], gpu_indexes[0], &rotated_buffer, num_blocks - rotations,
+        num_blocks);
     copy_radix_ciphertext_slice_async(streams[0], gpu_indexes[0], lwe_array, 0,
                                       num_blocks, &rotated_buffer, 0,
                                       num_blocks);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
index 52f7ef44eb..d7e8f48fde 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -110,9 +110,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                                       rotated_input, input_bits_b, rotations,
                                       total_nb_bits);
 
-    if (rotations > 0)
-      set_zero_radix_ciphertext_slice_async(
-          streams[0], gpu_indexes[0], rotated_input, 0, rotations);
+    set_zero_radix_ciphertext_slice_async(streams[0], gpu_indexes[0],
+                                          rotated_input, 0, rotations);
     break;
   case RIGHT_SHIFT:
     // rotate left as the blocks are from LSB to MSB
@@ -130,10 +129,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                                           &last_bit, 0, 1);
     } else {
-      if (rotations > 0)
-        set_zero_radix_ciphertext_slice_async(
-            streams[0], gpu_indexes[0], rotated_input,
-            total_nb_bits - rotations, total_nb_bits);
+      set_zero_radix_ciphertext_slice_async(
+          streams[0], gpu_indexes[0], rotated_input,
+          total_nb_bits - rotations, total_nb_bits);
     }
     break;
   case LEFT_ROTATE:
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 8cdc924a41..8615b97721 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -362,10 +362,10 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_out: *mut ffi::c_void,
-        radix_lwe_left: *const ffi::c_void,
+        radix_lwe_out: *mut CudaRadixCiphertextFFI,
+        radix_lwe_left: *const CudaRadixCiphertextFFI,
         is_bool_left: bool,
-        radix_lwe_right: *const ffi::c_void,
+        radix_lwe_right: *const CudaRadixCiphertextFFI,
         is_bool_right: bool,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
@@ -909,13 +909,11 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_out: *mut ffi::c_void,
-        radix_lwe_vec: *mut ffi::c_void,
-        num_radix_in_vec: u32,
+        radix_lwe_out: *mut CudaRadixCiphertextFFI,
+        radix_lwe_vec: *mut CudaRadixCiphertextFFI,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        num_blocks_in_radix: u32,
     );
 }
 unsafe extern "C" {
@@ -952,16 +950,14 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
        decomposed_scalar: *const u64,
         has_at_least_one_set: *const u64,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        lwe_dimension: u32,
         polynomial_size: u32,
         message_modulus: u32,
-        num_blocks: u32,
         num_scalars: u32,
     );
 }
diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs
index cc36e6f7ce..e3090fd8c5 100644
--- a/tfhe/src/integer/gpu/ciphertext/info.rs
+++ b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -1,5 +1,3 @@
-use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
-use crate::integer::server_key::TwosComplementNegation;
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::{CarryModulus, MessageModulus, PBSOrder};
 
@@ -24,84 +22,6 @@ pub struct CudaRadixCiphertextInfo {
 }
 
 impl CudaRadixCiphertextInfo {
-    // Creates an iterator that return decomposed blocks of the negated
-    // value of `scalar`
-    //
-    // Returns
-    // - `None` if scalar is zero
-    // - `Some` if scalar is non-zero
-    //
-    fn create_negated_block_decomposer<T>(&self, scalar: T) -> Option<impl Iterator<Item = u8>>
-    where
-        T: TwosComplementNegation + DecomposableInto<u8>,
-    {
-        if scalar == T::ZERO {
-            return None;
-        }
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        assert!(bits_in_message <= u8::BITS);
-
-        // The whole idea behind this iterator we construct is:
-        // - to support combos of parameters and num blocks for which the total number of bits is
-        //   not a multiple of T::BITS
-        //
-        // - Support subtraction in the case the T::BITS is lower than the target ciphertext bits.
-        //   In clear rust this would require an upcast, to support that we have to do a few things
-
-        let neg_scalar = scalar.twos_complement_negation();
-
-        // If we had upcasted the scalar, its msb would be zeros (0)
-        // then they would become ones (1) after the bitwise_not (!).
-        // The only case where these msb could become 0 after the addition
-        // is if scalar == T::ZERO (=> !T::ZERO == T::MAX => T::MAX + 1 == overflow),
-        // but this case has been handled earlier.
-        let padding_bit = 1u32; // To handle when bits is not a multiple of T::BITS
-        // All bits of message set to one
-        let pad_block = (1 << bits_in_message as u8) - 1;
-
-        let decomposer = BlockDecomposer::with_padding_bit(
-            neg_scalar,
-            bits_in_message,
-            T::cast_from(padding_bit),
-        )
-        .iter_as::<u8>()
-        .chain(std::iter::repeat(pad_block));
-        Some(decomposer)
-    }
-
-    pub(crate) fn after_mul(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|left| CudaBlockInfo {
-                    degree: Degree::new(left.message_modulus.0 - 1),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
-    pub(crate) fn after_ilog2(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|info| CudaBlockInfo {
-                    degree: Degree::new(info.message_modulus.0 - 1),
-                    message_modulus: info.message_modulus,
-                    carry_modulus: info.carry_modulus,
-                    pbs_order: info.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_div_rem(&self) -> Self {
         Self {
             blocks: self
@@ -149,47 +69,6 @@ impl CudaRadixCiphertextInfo {
         }
     }
 
-    pub(crate) fn after_scalar_mul(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|info| CudaBlockInfo {
-                    degree: Degree::new(info.message_modulus.0 - 1),
-                    message_modulus: info.message_modulus,
-                    carry_modulus: info.carry_modulus,
-                    pbs_order: info.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
-    pub(crate) fn after_scalar_sub<T>(&self, scalar: T) -> Self
-    where
-        T: TwosComplementNegation + DecomposableInto<u8>,
-    {
-        let Some(decomposer) = self.create_negated_block_decomposer(scalar) else {
-            // subtraction by zero
-            return self.clone();
-        };
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(decomposer)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: Degree::new(left.degree.get() + u64::from(scalar_block)),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_bitnot(&self) -> Self {
         Self {
             blocks: self
diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs
index f304704e1e..453955c8e5 100644
--- a/tfhe/src/integer/gpu/ciphertext/mod.rs
+++ b/tfhe/src/integer/gpu/ciphertext/mod.rs
@@ -171,6 +171,23 @@ impl CudaRadixCiphertext {
             })
             .collect()
     }
+    pub fn from_radix_ciphertext_vec<T: CudaIntegerRadixCiphertext>(
+        list: &[T],
+        streams: &CudaStreams,
+    ) -> Self {
+        let lwes = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
+            list.iter().map(|ciphertext| &ciphertext.as_ref().d_blocks),
+            streams,
+        );
+        let info = CudaRadixCiphertextInfo {
+            blocks: list
+                .iter()
+                .flat_map(|ciphertext| ciphertext.as_ref().info.blocks.iter())
+                .copied()
+                .collect::<Vec<_>>(),
+        };
+        Self::new(lwes, info)
+    }
 }
 
 impl CudaUnsignedRadixCiphertext {
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index bfda9e34c6..4b9a12f5d5 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -279,7 +279,7 @@ pub unsafe fn scalar_addition_integer_radix_assign_async(
 /// is required
 pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(
     streams: &CudaStreams,
-    lwe_array: &mut CudaVec<T>,
+    lwe_array: &mut CudaRadixCiphertext,
     decomposed_scalar: &[T],
     has_at_least_one_set: &[T],
     bootstrapping_key: &CudaVec<B>,
@@ -293,14 +293,13 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(
         decomposed_scalar.as_ptr().cast::<u64>(),
         has_at_least_one_set.as_ptr().cast::<u64>(),
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
-        (glwe_dimension.0 * polynomial_size.0) as u32,
         polynomial_size.0 as u32,
         message_modulus.0 as u32,
-        num_blocks,
         num_scalars,
     );
@@ -357,6 +366,7 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(
     streams: &CudaStreams,
-    radix_lwe_left: &mut CudaVec<T>,
+    radix_lwe_left: &mut CudaRadixCiphertext,
     is_boolean_left: bool,
-    radix_lwe_right: &CudaVec<T>,
+    radix_lwe_right: &CudaRadixCiphertext,
     is_boolean_right: bool,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
@@ -613,12 +623,12 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async(
     streams: &CudaStreams,
-    result: &mut CudaVec<T>,
-    radix_list: &mut CudaVec<T>,
+    result: &mut CudaRadixCiphertext,
+    radix_list: &mut CudaRadixCiphertext,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
     message_modulus: MessageModulus,
@@ -2747,12 +2798,12 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        result.gpu_index(0),
+        result.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_list.gpu_index(0),
+        radix_list.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -2766,6 +2817,22 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
         "GPU error: all data should reside on the same GPU."
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut result_degrees = result.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut result_noise_levels = result.info.blocks.iter().map(|b| b.noise_level.0).collect();
+    let mut cuda_ffi_result =
+        prepare_cuda_radix_ffi(result, &mut result_degrees, &mut result_noise_levels);
+    let mut radix_list_degrees = radix_list.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut radix_list_noise_levels = radix_list
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_radix_list = prepare_cuda_radix_ffi(
+        radix_list,
+        &mut radix_list_degrees,
+        &mut radix_list_noise_levels,
+    );
     scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
@@ -2790,13 +2857,11 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        result.as_mut_c_ptr(0),
-        radix_list.as_mut_c_ptr(0),
-        num_radixes,
+        &mut cuda_ffi_result,
+        &mut cuda_ffi_radix_list,
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
-        num_blocks,
     );
     cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
         streams.ptr.as_ptr(),
@@ -2804,6 +2869,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
         streams.len() as u32,
         std::ptr::addr_of_mut!(mem_ptr),
     );
+    update_noise_degree(result, &cuda_ffi_result);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs
index 2ba76c13ee..86336eead1 100644
--- a/tfhe/src/integer/gpu/server_key/radix/add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/add.rs
@@ -1,9 +1,9 @@
-use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::LweBskGroupingFactor;
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{
-    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
+    CudaUnsignedRadixCiphertext,
 };
 use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
 use crate::integer::gpu::{
@@ -242,6 +242,7 @@ impl CudaServerKey {
             streams,
             0,
         );
+        result.as_mut().info = ciphertexts[0].as_ref().info.clone();
         if ciphertexts.len() == 1 {
             return;
         }
@@ -262,19 +263,14 @@ impl CudaServerKey {
 
         let radix_count_in_vec = ciphertexts.len();
 
-        let mut terms = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
-            ciphertexts
-                .iter()
-                .map(|ciphertext| &ciphertext.as_ref().d_blocks),
-            streams,
-        );
+        let mut terms = CudaRadixCiphertext::from_radix_ciphertext_vec(ciphertexts, streams);
 
         match &self.bootstrapping_key {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
                     streams,
-                    &mut result.as_mut().d_blocks.0.d_vec,
-                    &mut terms.0.d_vec,
+                    result.as_mut(),
+                    &mut terms,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.message_modulus,
@@ -297,8 +293,8 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
                     streams,
-                    &mut result.as_mut().d_blocks.0.d_vec,
-                    &mut terms.0.d_vec,
+                    result.as_mut(),
+                    &mut terms,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.message_modulus,
diff --git a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
index 2acf8ce9fe..9aa6571c20 100644
--- a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
@@ -162,6 +162,22 @@ impl CudaServerKey {
                 .as_mut_slice((i * lwe_size)..((i + 1) * lwe_size), 0)
                 .unwrap();
             dest_slice.copy_from_gpu_async(&src_slice, streams, 0);
+            for b in new_item.ciphertext.info.blocks.iter_mut() {
+                b.degree = leading_count_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .degree;
+                b.noise_level = leading_count_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .noise_level;
+            }
             cts.push(new_item);
         }
 
@@ -404,6 +420,22 @@ impl CudaServerKey {
                 .as_mut_slice((i * lwe_size)..((i + 1) * lwe_size), 0)
                 .unwrap();
             dest_slice.copy_from_gpu_async(&src_slice, streams, 0);
+            for b in new_item.ciphertext.info.blocks.iter_mut() {
+                b.degree = leading_zeros_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .degree;
+                b.noise_level = leading_zeros_per_blocks
+                    .as_ref()
+                    .info
+                    .blocks
+                    .get(i)
+                    .unwrap()
+                    .noise_level;
+            }
             cts.push(new_item);
         }
 
@@ -490,9 +522,7 @@ impl CudaServerKey {
 
         let result = self.sum_ciphertexts_async(ciphertexts, streams).unwrap();
 
-        let mut result_cast = self.cast_to_unsigned_async(result, counter_num_blocks, streams);
-        result_cast.as_mut().info = ct.as_ref().info.after_ilog2();
-        result_cast
+        self.cast_to_unsigned_async(result, counter_num_blocks, streams)
     }
 
     /// Returns the number of trailing zeros in the binary representation of `ct`
diff --git a/tfhe/src/integer/gpu/server_key/radix/mul.rs b/tfhe/src/integer/gpu/server_key/radix/mul.rs
index 5f199ed990..3541647003 100644
--- a/tfhe/src/integer/gpu/server_key/radix/mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mul.rs
@@ -81,9 +81,9 @@ impl CudaServerKey {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_mul_integer_radix_kb_assign_async(
                     streams,
-                    &mut ct_left.as_mut().d_blocks.0.d_vec,
+                    ct_left.as_mut(),
                     is_boolean_left,
-                    &ct_right.as_ref().d_blocks.0.d_vec,
+                    ct_right.as_ref(),
                     is_boolean_right,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
@@ -104,9 +104,9 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_mul_integer_radix_kb_assign_async(
                     streams,
-                    &mut ct_left.as_mut().d_blocks.0.d_vec,
+                    ct_left.as_mut(),
                     is_boolean_left,
-                    &ct_right.as_ref().d_blocks.0.d_vec,
+                    ct_right.as_ref(),
                     is_boolean_right,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
@@ -125,8 +125,6 @@ impl CudaServerKey {
                 );
             }
         }
-
-        ct_left.as_mut().info = ct_left.as_ref().info.after_mul();
     }
 
     pub fn unchecked_mul_assign(
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
index 0e2ba4633f..c1515eb688 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
@@ -116,7 +116,7 @@ impl CudaServerKey {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_scalar_mul_integer_radix_kb_async(
                     streams,
-                    &mut ct.as_mut().d_blocks.0.d_vec,
+                    ct.as_mut(),
                     decomposed_scalar.as_slice(),
                     has_at_least_one_set.as_slice(),
                     &d_bsk.d_vec,
@@ -132,7 +132,6 @@ impl CudaServerKey {
                     d_bsk.decomp_level_count,
                     self.key_switching_key.decomposition_base_log(),
                     self.key_switching_key.decomposition_level_count(),
-                    num_blocks as u32,
                     decomposed_scalar.len() as u32,
                     PBSType::Classical,
                     LweBskGroupingFactor(0),
@@ -141,7 +140,7 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_scalar_mul_integer_radix_kb_async(
                     streams,
-                    &mut ct.as_mut().d_blocks.0.d_vec,
+                    ct.as_mut(),
                     decomposed_scalar.as_slice(),
                     has_at_least_one_set.as_slice(),
                     &d_multibit_bsk.d_vec,
@@ -157,15 +156,12 @@ impl CudaServerKey {
                     d_multibit_bsk.decomp_level_count,
                     self.key_switching_key.decomposition_base_log(),
                     self.key_switching_key.decomposition_level_count(),
-                    num_blocks as u32,
                     decomposed_scalar.len() as u32,
                     PBSType::MultiBit,
                     d_multibit_bsk.grouping_factor,
                 );
             }
         }
-
-        ct.as_mut().info = ct.as_ref().info.after_scalar_mul();
     }
 
     pub fn unchecked_scalar_mul_assign(
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
index 035ac19484..a45d67e072 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
@@ -78,7 +78,6 @@ impl CudaServerKey {
     {
         let negated_scalar = scalar.twos_complement_negation();
         self.unchecked_scalar_add_assign_async(ct, negated_scalar, streams);
-        ct.as_mut().info = ct.as_ref().info.after_scalar_sub(scalar);
     }
 
     pub fn unchecked_scalar_sub_assign(
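
Reviewer note (not part of the patch): every call site converted above follows the same round trip — flatten each block's degree and noise level into plain buffers, hand the backend a `CudaRadixCiphertextFFI`-style view over them, and afterwards copy the values the backend wrote back into the Rust-side block infos (the `prepare_cuda_radix_ffi` / `update_noise_degree` pairs). The sketch below is a minimal, self-contained illustration of that pattern only; `BlockInfo`, `RadixCiphertext`, `RadixFfiView`, `prepare_view` and `write_back` are stand-in names invented for the example, not tfhe-rs APIs.

```rust
// Illustrative sketch only: toy stand-ins for the radix ciphertext and its
// FFI view; all names and fields here are assumptions, not the crate's types.
#[derive(Clone, Copy, Debug)]
struct BlockInfo {
    degree: u64,
    noise_level: u64,
}

struct RadixCiphertext {
    // Device data omitted; only the per-block metadata matters for the pattern.
    blocks: Vec<BlockInfo>,
}

/// Flat, C-friendly view over caller-owned metadata buffers, in the spirit of
/// the CudaRadixCiphertextFFI struct passed across the backend boundary.
#[allow(dead_code)]
#[repr(C)]
struct RadixFfiView {
    degrees: *mut u64,
    noise_levels: *mut u64,
    num_radix_blocks: u32,
}

/// Mirrors the `prepare_cuda_radix_ffi(ct, &mut degrees, &mut noise_levels)`
/// call pattern: fill the scratch buffers and expose raw pointers to them.
fn prepare_view(
    ct: &RadixCiphertext,
    degrees: &mut Vec<u64>,
    noise_levels: &mut Vec<u64>,
) -> RadixFfiView {
    *degrees = ct.blocks.iter().map(|b| b.degree).collect();
    *noise_levels = ct.blocks.iter().map(|b| b.noise_level).collect();
    RadixFfiView {
        degrees: degrees.as_mut_ptr(),
        noise_levels: noise_levels.as_mut_ptr(),
        num_radix_blocks: ct.blocks.len() as u32,
    }
}

/// Mirrors `update_noise_degree`: copy whatever the backend wrote into the
/// scratch buffers back into the Rust-side block infos.
fn write_back(ct: &mut RadixCiphertext, degrees: &[u64], noise_levels: &[u64]) {
    for ((block, degree), noise) in ct.blocks.iter_mut().zip(degrees).zip(noise_levels) {
        block.degree = *degree;
        block.noise_level = *noise;
    }
}

fn main() {
    let mut ct = RadixCiphertext {
        blocks: vec![BlockInfo { degree: 3, noise_level: 1 }; 4],
    };
    let (mut degrees, mut noise_levels) = (Vec::new(), Vec::new());
    let view = prepare_view(&ct, &mut degrees, &mut noise_levels);
    assert_eq!(view.num_radix_blocks, 4);
    // A real call would pass the view across the FFI boundary and the backend
    // could update the buffers; simulate such an update here.
    degrees.iter_mut().for_each(|d| *d = 1);
    write_back(&mut ct, &degrees, &noise_levels);
    assert!(ct.blocks.iter().all(|b| b.degree == 1 && b.noise_level == 1));
}
```

The buffers stay owned by the Rust caller for the duration of the call, which is why the converted call sites above allocate the `*_degrees` / `*_noise_levels` vectors before building the FFI struct and only read them back once the backend call has completed.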