fix(gpu): fix compression on multi-gpu
pdroalves committed Aug 21, 2024
1 parent 358bcc9 commit 63ffe07
Showing 2 changed files with 61 additions and 17 deletions.
@@ -27,17 +27,16 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
     uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
     PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
 
+  // Decompression doesn't keyswitch, so big and small dimensions are the same
   int_radix_params encryption_params(
       pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      (encryption_glwe_dimension + 1) * encryption_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
+      message_modulus, carry_modulus);
 
   int_radix_params compression_params(
       pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      (compression_glwe_dimension + 1) * compression_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
+      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
 
   scratch_cuda_integer_decompress_radix_ciphertext_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
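
To make the two new argument lists easier to line up, here is a minimal self-contained sketch of the dimension mapping. The stand-in struct, its field order, and the concrete sizes are assumptions for illustration; only the big/small assignments themselves come from the diff above.

#include <cstdint>
#include <cstdio>

// Pared-down stand-in for int_radix_params (the real struct has more fields:
// ks/pbs levels, base logs, moduli, etc.). Only the dimensions matter here.
struct radix_params_sketch {
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
  uint32_t big_lwe_dimension;
  uint32_t small_lwe_dimension;
};

int main() {
  // Hypothetical sizes, for illustration only.
  uint32_t encryption_glwe_dimension = 1, encryption_polynomial_size = 2048;
  uint32_t compression_glwe_dimension = 1, compression_polynomial_size = 1024;
  // Presumably the lwe_dimension passed to the scratch function is the big
  // LWE dimension of the encryption key, i.e. glwe_dimension * polynomial_size.
  uint32_t lwe_dimension =
      encryption_glwe_dimension * encryption_polynomial_size;

  // Decompression doesn't keyswitch, so big == small on the encryption side.
  radix_params_sketch encryption_params{encryption_glwe_dimension,
                                        encryption_polynomial_size,
                                        lwe_dimension, lwe_dimension};

  // The compression side's small dimension is what sample extraction from the
  // compression GLWE yields: glwe_dimension * polynomial_size of that key.
  radix_params_sketch compression_params{
      compression_glwe_dimension, compression_polynomial_size, lwe_dimension,
      compression_glwe_dimension * compression_polynomial_size};

  // The PBS in host_integer_decompress goes from the compression small
  // dimension up to the encryption big dimension.
  printf("PBS: %u -> %u\n", compression_params.small_lwe_dimension,
         encryption_params.big_lwe_dimension);
  return 0;
}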
@@ -199,17 +199,62 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
   auto encryption_params = mem_ptr->encryption_params;
-  auto carry_extract_lut = mem_ptr->carry_extract_lut;
-  execute_pbs_async<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out,
-      carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
-      carry_extract_lut->lut_indexes_vec, extracted_lwe,
-      carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
-      encryption_params.glwe_dimension,
-      compression_params.glwe_dimension * compression_params.polynomial_size,
-      encryption_params.polynomial_size, encryption_params.pbs_base_log,
-      encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
-      encryption_params.pbs_type);
+  auto lut = mem_ptr->carry_extract_lut;
+  printf("encryption_params: ");
+  encryption_params.print();
+  printf("compression_params: ");
+  compression_params.print();
+  auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_out,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
+        lut->lwe_indexes_in, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+  } else {
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
+        compression_params.small_lwe_dimension + 1);
+
+    /// Apply PBS
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes, num_lwes,
+                                      encryption_params.big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
 }
 
 template <typename Torus>
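
The else-branch only works if the scatter and the gather agree on how the num_lwes inputs are partitioned across devices, and on the per-ciphertext size on each side of the PBS. Below is a small runnable sketch of one plausible split policy; the helper name and the even-split-with-remainder rule are assumptions for illustration, not the backend's actual code.

#include <cstdint>
#include <cstdio>

// Hypothetical split (illustration only): contiguous chunks, with the first
// GPUs taking one extra input when num_lwes does not divide evenly.
uint32_t inputs_on_gpu(uint32_t num_lwes, uint32_t gpu_id,
                       uint32_t active_gpu_count) {
  uint32_t base = num_lwes / active_gpu_count;
  uint32_t rem = num_lwes % active_gpu_count;
  return base + (gpu_id < rem ? 1 : 0);
}

int main() {
  // Example: 10 ciphertexts over 4 GPUs -> 3, 3, 2, 2.
  uint32_t num_lwes = 10, active_gpu_count = 4;
  for (uint32_t g = 0; g < active_gpu_count; g++)
    printf("GPU %u: %u LWEs\n", g,
           inputs_on_gpu(num_lwes, g, active_gpu_count));
  return 0;
}

Note also the asymmetric sizes in the diff itself: the scatter moves ciphertexts of compression_params.small_lwe_dimension + 1 words, while the gather moves encryption_params.big_lwe_dimension + 1, because the PBS changes the LWE dimension in between.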
