fix(gpu): fix compression on multi-gpu
pdroalves authored and agnesLeroy committed Sep 3, 2024
1 parent 358bcc9 · commit 207f689
Showing 6 changed files with 70 additions and 17 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_h100_tests.yml
@@ -147,6 +147,7 @@ jobs:
       - name: Run core crypto and internal CUDA backend tests
         run: |
           BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
           BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
       - name: Run user docs tests
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_tests.yml
@@ -145,6 +145,7 @@ jobs:
       - name: Run core crypto and internal CUDA backend tests
         run: |
           make test_core_crypto_gpu
+          make test_integer_compression_gpu
           make test_cuda_backend
       - name: Run user docs tests
4 changes: 4 additions & 0 deletions .github/workflows/gpu_full_multi_gpu_tests.yml
@@ -144,6 +144,10 @@ jobs:
         if: ${{ !cancelled() }}
         run: nvidia-smi
 
+      - name: Run multi-bit CUDA integer compression tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
+      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
       - name: Run multi-bit CUDA integer tests
         run: |
7 changes: 7 additions & 0 deletions Makefile
@@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
 
+.PHONY: test_integer_compression_gpu
+test_integer_compression_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext --test-threads=6
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
+
 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
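The new target mirrors the surrounding GPU recipes: the first cargo invocation runs the runtime compression tests filtered to integer::gpu::ciphertext, capped at --test-threads=6 (presumably to bound how many tests hold GPU memory at once), and the second runs the doctests filtered to integer::gpu::ciphertext::compress. Locally, `make test_integer_compression_gpu` on a CUDA-capable machine should reproduce what the workflows above run in CI.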
@@ -27,17 +27,16 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
     uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
     PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
 
+  // Decompression doesn't keyswitch, so big and small dimensions are the same
   int_radix_params encryption_params(
       pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      (encryption_glwe_dimension + 1) * encryption_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
+      message_modulus, carry_modulus);
 
   int_radix_params compression_params(
       pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      (compression_glwe_dimension + 1) * compression_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
+      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
 
   scratch_cuda_integer_decompress_radix_ciphertext_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
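The two constructor calls above are easier to read with the argument order spelled out. The sketch below is a reconstruction from the call sites, not the backend's actual header (the real definition in the tfhe-cuda-backend sources is authoritative); it exists only to show which slot each argument fills. The point of the change is that both parameter sets now carry an explicit small LWE dimension: for encryption_params, big and small are both lwe_dimension, matching the comment that decompression never keyswitches, while for compression_params the small dimension becomes compression_glwe_dimension * compression_polynomial_size, the dimension of LWEs sample-extracted from the compressed GLWE, which the multi-GPU scatter in the decompression path below depends on.

```cpp
#include <cstdint>

// Stand-in for the backend's PBS_TYPE enum (assumption for this sketch).
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };

// Reconstructed field order implied by the 12-argument calls above.
struct int_radix_params {
  PBS_TYPE pbs_type;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
  uint32_t big_lwe_dimension;   // PBS output dimension
  uint32_t small_lwe_dimension; // PBS input dimension
  uint32_t ks_level;            // 0 in both calls: no keyswitch here
  uint32_t ks_base_log;         // 0 likewise
  uint32_t pbs_level;
  uint32_t pbs_base_log;
  uint32_t grouping_factor;     // multi-bit PBS grouping factor (0 at these call sites)
  uint32_t message_modulus;
  uint32_t carry_modulus;
};
```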
@@ -199,17 +199,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
   auto encryption_params = mem_ptr->encryption_params;
-  auto carry_extract_lut = mem_ptr->carry_extract_lut;
-  execute_pbs_async<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out,
-      carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
-      carry_extract_lut->lut_indexes_vec, extracted_lwe,
-      carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
-      encryption_params.glwe_dimension,
-      compression_params.glwe_dimension * compression_params.polynomial_size,
-      encryption_params.polynomial_size, encryption_params.pbs_base_log,
-      encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
-      encryption_params.pbs_type);
+  auto lut = mem_ptr->carry_extract_lut;
+  auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_out,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
+        lut->lwe_indexes_in, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+  } else {
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
+        compression_params.small_lwe_dimension + 1);
+
+    /// Apply PBS
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes, num_lwes,
+                                      encryption_params.big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
 }
 
 template <typename Torus>
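The rewrite hinges on get_active_gpu_count: batches too small to split stay on one GPU and skip the scatter/gather round-trip entirely. The helper's real implementation lives in the backend; the sketch below is only a model of its expected contract, under the assumption that it never assigns more GPUs than there are LWEs to process.

```cpp
#include <algorithm>
#include <cstdint>

// Model of the contract assumed by the dispatch above (the backend's actual
// helper may also weigh a minimum batch size per GPU): at least one GPU,
// and never more GPUs than inputs.
uint32_t get_active_gpu_count_model(uint32_t num_inputs, uint32_t gpu_count) {
  return std::max<uint32_t>(1u, std::min(num_inputs, gpu_count));
}
```

Two details of the multi-GPU branch are worth noting. First, the old code passed gpu_count to execute_pbs_async while supplying index arrays and buffers laid out for a single GPU, which is what broke decompression once several GPUs were active; the new path scatters inputs across the active GPUs at the compression small-LWE width, runs the PBS on each GPU, then gathers results back to GPU 0 at the encryption big-LWE width. Second, the synchronization is asymmetric by design: GPU 0's stream is synchronized before the scatter so the source data is fully resident, and every active GPU's stream is synchronized after the gather before control returns.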
