fix(gpu): fix compression #1485

Merged · 1 commit · Sep 10, 2024
Changes from all commits
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_h100_tests.yml
@@ -147,6 +147,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

- name: Run user docs tests
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_tests.yml
@@ -145,6 +145,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend

- name: Run user docs tests
4 changes: 4 additions & 0 deletions .github/workflows/gpu_full_multi_gpu_tests.yml
@@ -144,6 +144,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu

# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
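(The compression test runs on the multi-GPU instance as well, presumably because this PR gives decompression a dedicated multi-GPU code path; see the scatter/gather logic in the CUDA backend changes below.)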
13 changes: 13 additions & 0 deletions Makefile
@@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_compression_gpu
test_integer_compression_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -883,6 +890,12 @@ bench_integer_gpu: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --

.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
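Both workflows above call the new test target. It runs the dedicated compression unit test and then the doctests under integer::gpu::ciphertext::compress, and the same coverage is available locally via make test_integer_compression_gpu. The bench target reuses the existing glwe_packing_compression-integer-bench harness with the gpu feature enabled.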
11 changes: 7 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {

uint32_t storage_log_modulus;

uint32_t num_lwes;
uint32_t body_count;

Torus *tmp_extracted_glwe;
@@ -104,12 +106,13 @@
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
this->num_lwes = num_radix_blocks;
this->body_count = body_count;

if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
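The new num_lwes field is the crux of the fix: int_decompression previously stored num_radix_blocks in body_count and used it for both purposes. A minimal sketch, with assumed example values, of why the two counts differ:

```cpp
// Illustration only, with assumed values (not code from this PR).
// A compressed list can pack the bodies of several ciphertexts into a
// single GLWE, so when one entry of the list is decompressed:
#include <cstdint>

int main() {
  uint32_t body_count = 32 + 32 + 1; // bodies stored in the packed GLWE,
                                     // e.g. an unsigned, a signed and a
                                     // boolean ciphertext (as in the test)
  uint32_t num_lwes = 32;            // blocks to sample-extract for the
                                     // single entry being decompressed
  // Conflating the two reads the wrong number of bodies when unpacking,
  // or extracts the wrong number of blocks afterwards.
  return body_count >= num_lwes ? 0 : 1;
}
```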
@@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory) {

// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);

int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);

scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
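To make the comment about keyswitching concrete: sample-extracting from the packing GLWE yields LWEs whose dimension is the GLWE mask size, and since decompression never keyswitches, that dimension feeds the PBS directly. A small sketch with assumed parameter values:

```cpp
// Assumed example values (not code from this PR); k = glwe_dimension,
// N = polynomial_size of the compression scheme.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t compression_glwe_dimension = 1;     // k (assumed)
  uint32_t compression_polynomial_size = 1024; // N (assumed)
  // An LWE sample-extracted from a GLWE has dimension k * N; the params
  // above now record this as compression_params.small_lwe_dimension, which
  // host_integer_decompress passes to the PBS as its input dimension.
  uint32_t pbs_input_lwe_dimension =
      compression_glwe_dimension * compression_polynomial_size;
  printf("PBS input LWE dimension: %u\n", pbs_input_lwe_dimension);
}
```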
@@ -45,7 +45,6 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;

auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
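A worked instance of the packing arithmetic above (all values assumed), matching the div_ceil comment:

```cpp
// Packing-length sketch with assumed parameters (not code from this PR).
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t glwe_dimension = 1, polynomial_size = 1024, body_count = 32;
  uint32_t log_modulus = 12; // storage_log_modulus (assumed)
  // k*N mask coefficients plus one body per stored LWE...
  uint64_t in_len = uint64_t(glwe_dimension) * polynomial_size + body_count;
  // ...each kept to log_modulus bits, rounded up to whole Torus words:
  uint64_t number_bits_to_pack = in_len * log_modulus;
  uint64_t nbits = sizeof(uint64_t) * 8; // Torus = uint64_t
  uint64_t len = (number_bits_to_pack + nbits - 1) / nbits; // div_ceil
  printf("%llu coefficients pack into %llu words\n",
         (unsigned long long)in_len, (unsigned long long)len); // 1056 -> 198
}
```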
@@ -80,6 +79,7 @@ __host__ void host_integer_compress(cudaStream_t *streams,
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -92,11 +92,9 @@
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
compression_params.ks_level, body_count);
}

auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
@@ -156,15 +154,15 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;

uint32_t body_count = mem_ptr->body_count;

auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
params.glwe_dimension * params.polynomial_size + body_count;

// We ensure the tail of the GLWE is zeroed
auto zeroed_slice =
glwe_array_out + params.glwe_dimension * params.polynomial_size;
cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(zeroed_slice, 0,
(params.polynomial_size - body_count) * sizeof(Torus),
stream, gpu_index);

int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
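The two fixes above are easier to see against the layout of an unpacked GLWE: the useful data is the k*N-coefficient mask plus body_count bodies, and only the remaining body slots need zeroing, starting right after the stored bodies. A sketch with assumed sizes:

```cpp
// Layout sketch with assumed values (not code from this PR).
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t glwe_dimension = 1, polynomial_size = 1024, body_count = 32;
  // The old length added body_count * body_count, apparently a typo; the
  // unpacked length is mask plus stored bodies:
  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
  // Zero only the unused tail of the body polynomial, starting at
  // initial_out_len rather than at the start of the body section:
  uint32_t zeroed_offset = initial_out_len;           // 1056
  uint32_t zeroed_len = polynomial_size - body_count; // 992
  printf("zero [%u, %u)\n", zeroed_offset, zeroed_offset + zeroed_len);
}
```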
@@ -187,7 +185,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);

auto num_lwes = mem_ptr->body_count;
auto num_lwes = mem_ptr->num_lwes;

// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
@@ -199,17 +197,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Apply a PBS to evaluate the LUT, reduce the noise and go from a small
/// LWE dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
compression_params.small_lwe_dimension + 1);

/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
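The single-GPU branch writes PBS results straight into lwe_array_out. With several GPUs, the extracted LWEs are first scattered from GPU 0 into per-GPU buffers, the PBS runs on each shard with trivial input indexes, and the gather copies results back to GPU 0 while restoring the original ordering through h_lwe_indexes_out; the final loop synchronizes every active stream so GPU 0 cannot observe partial results.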

template <typename Torus>
@@ -227,12 +266,12 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {

*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_lwes, storage_log_modulus, allocate_gpu_memory);
num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
}
#endif
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -120,6 +120,7 @@ extern "C" {
carry_modulus: u32,
pbs_type: u32,
storage_log_modulus: u32,
bodies_count: u32,
allocate_gpu_memory: bool,
);
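The parameter order here has to mirror the C declaration in compression.h exactly (storage_log_modulus, then the new bodies_count, then allocate_gpu_memory): extern "C" bindings are not checked against the header at compile time, so a mismatch would silently misread arguments at the FFI boundary.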

103 changes: 44 additions & 59 deletions tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
@@ -138,79 +138,64 @@ mod tests {
use super::*;
use crate::integer::gpu::gen_keys_radix_gpu;
use crate::integer::ClientKey;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;

const NB_TESTS: usize = 10;
#[test]
fn test_gpu_ciphertext_compression() {
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);

let private_compression_key =
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);

let streams = CudaStreams::new_multi_gpu();

let num_blocks = 4;
let num_blocks = 32;
let (radix_cks, _) = gen_keys_radix_gpu(
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
num_blocks,
&streams,
);

let (cuda_compression_key, cuda_decompression_key) =
radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);

let ct1 = radix_cks.encrypt(3_u32);
let ct2 = radix_cks.encrypt(2_u32);
let ct3 = radix_cks.encrypt_signed(-2);
let ct4 = cks.encrypt_bool(true);

// Copy to GPU
let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams);
let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams);
let d_ct4 = CudaBooleanBlock::from_boolean_block(&ct4, &streams);

let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
.push(d_ct1, &streams)
.push(d_ct2, &streams)
.push(d_ct3, &streams)
.push(d_ct4, &streams)
.build(&cuda_compression_key, &streams);

let d_decompressed1 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
};

let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed1);

assert_eq!(decrypted, 3_u32);
let d_decompressed2 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
};

let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed2);

assert_eq!(decrypted, 2_u32);
let d_decompressed3 = CudaSignedRadixCiphertext {
ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams),
};

let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams);
let decrypted: i32 = radix_cks.decrypt_signed(&decompressed3);

assert_eq!(decrypted, -2);
let d_decompressed4 = CudaBooleanBlock::from_cuda_radix_ciphertext(cuda_compressed.get(
3,
&cuda_decompression_key,
&streams,
));

let decompressed4 = d_decompressed4.to_boolean_block(&streams);
let decrypted = radix_cks.decrypt_bool(&decompressed4);

assert!(decrypted);
for _ in 0..NB_TESTS {
let ct1 = radix_cks.encrypt(3_u32);
let ct2 = radix_cks.encrypt_signed(-2);
let ct3 = radix_cks.encrypt_bool(true);

// Copy to GPU
let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
let d_ct2 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct2, &streams);
let d_ct3 = CudaBooleanBlock::from_boolean_block(&ct3, &streams);

let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
.push(d_ct1, &streams)
.push(d_ct2, &streams)
.push(d_ct3, &streams)
.build(&cuda_compression_key, &streams);

let d_decompressed1 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
};
let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed1);
assert_eq!(decrypted, 3_u32);

let d_decompressed2 = CudaSignedRadixCiphertext {
ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
};
let decompressed2 = d_decompressed2.to_signed_radix_ciphertext(&streams);
let decrypted: i32 = radix_cks.decrypt_signed(&decompressed2);
assert_eq!(decrypted, -2);

let d_decompressed3 = CudaBooleanBlock::from_cuda_radix_ciphertext(
cuda_compressed.get(2, &cuda_decompression_key, &streams),
);
let decompressed3 = d_decompressed3.to_boolean_block(&streams);
let decrypted = radix_cks.decrypt_bool(&decompressed3);
assert!(decrypted);
}
}
}
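The looped test now covers an unsigned, a signed and a boolean ciphertext per iteration over 32 blocks, and its path (integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression) is exactly the filter used by the new test_integer_compression_gpu Makefile target.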