fix(gpu): fix compression #1485

Merged · 1 commit · Sep 10, 2024
Changes from all commits
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_h100_tests.yml
@@ -147,6 +147,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

- name: Run user docs tests
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_tests.yml
@@ -145,6 +145,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend

- name: Run user docs tests
4 changes: 4 additions & 0 deletions .github/workflows/gpu_full_multi_gpu_tests.yml
@@ -144,6 +144,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu

# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
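(The compression test runs on the multi-GPU instance as well, presumably because this PR gives decompression a dedicated multi-GPU code path; see the scatter/gather logic in the CUDA backend changes below.)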
13 changes: 13 additions & 0 deletions Makefile
@@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_compression_gpu
test_integer_compression_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -883,6 +890,12 @@ bench_integer_gpu: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --

.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
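Both workflows above call the new test target. It runs the dedicated compression unit test and then the doctests under integer::gpu::ciphertext::compress, and the same coverage is available locally via make test_integer_compression_gpu. The bench target reuses the existing glwe_packing_compression-integer-bench harness with the gpu feature enabled.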
11 changes: 7 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {

uint32_t storage_log_modulus;

uint32_t num_lwes;
uint32_t body_count;

Torus *tmp_extracted_glwe;
@@ -104,12 +106,13 @@
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
this->num_lwes = num_radix_blocks;
this->body_count = body_count;

if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
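The new num_lwes field is the crux of the fix: int_decompression previously stored num_radix_blocks in body_count and used it for both purposes. A minimal sketch, with assumed example values, of why the two counts differ:

```cpp
// Illustration only, with assumed values (not code from this PR).
// A compressed list can pack the bodies of several ciphertexts into a
// single GLWE, so when one entry of the list is decompressed:
#include <cstdint>

int main() {
  uint32_t body_count = 32 + 32 + 1; // bodies stored in the packed GLWE,
                                     // e.g. an unsigned, a signed and a
                                     // boolean ciphertext (as in the test)
  uint32_t num_lwes = 32;            // blocks to sample-extract for the
                                     // single entry being decompressed
  // Conflating the two reads the wrong number of bodies when unpacking,
  // or extracts the wrong number of blocks afterwards.
  return body_count >= num_lwes ? 0 : 1;
}
```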
@@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory) {

// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);

int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);

scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
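To make the comment about keyswitching concrete: sample-extracting from the packing GLWE yields LWEs whose dimension is the GLWE mask size, and since decompression never keyswitches, that dimension feeds the PBS directly. A small sketch with assumed parameter values:

```cpp
// Assumed example values (not code from this PR); k = glwe_dimension,
// N = polynomial_size of the compression scheme.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t compression_glwe_dimension = 1;     // k (assumed)
  uint32_t compression_polynomial_size = 1024; // N (assumed)
  // An LWE sample-extracted from a GLWE has dimension k * N; the params
  // above now record this as compression_params.small_lwe_dimension, which
  // host_integer_decompress passes to the PBS as its input dimension.
  uint32_t pbs_input_lwe_dimension =
      compression_glwe_dimension * compression_polynomial_size;
  printf("PBS input LWE dimension: %u\n", pbs_input_lwe_dimension);
}
```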
@@ -45,7 +45,6 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;

auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
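A worked instance of the packing arithmetic above (all values assumed), matching the div_ceil comment:

```cpp
// Packing-length sketch with assumed parameters (not code from this PR).
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t glwe_dimension = 1, polynomial_size = 1024, body_count = 32;
  uint32_t log_modulus = 12; // storage_log_modulus (assumed)
  // k*N mask coefficients plus one body per stored LWE...
  uint64_t in_len = uint64_t(glwe_dimension) * polynomial_size + body_count;
  // ...each kept to log_modulus bits, rounded up to whole Torus words:
  uint64_t number_bits_to_pack = in_len * log_modulus;
  uint64_t nbits = sizeof(uint64_t) * 8; // Torus = uint64_t
  uint64_t len = (number_bits_to_pack + nbits - 1) / nbits; // div_ceil
  printf("%llu coefficients pack into %llu words\n",
         (unsigned long long)in_len, (unsigned long long)len); // 1056 -> 198
}
```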
@@ -80,6 +79,7 @@ __host__ void host_integer_compress(cudaStream_t *streams,
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -92,11 +92,9 @@
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
compression_params.ks_level, body_count);
}

auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
@@ -156,15 +154,15 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;

uint32_t body_count = mem_ptr->body_count;

auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
params.glwe_dimension * params.polynomial_size + body_count;

// We ensure the tail of the GLWE is zeroed
auto zeroed_slice =
glwe_array_out + params.glwe_dimension * params.polynomial_size;
cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(zeroed_slice, 0,
(params.polynomial_size - body_count) * sizeof(Torus),
stream, gpu_index);

int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
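The two fixes above are easier to see against the layout of an unpacked GLWE: the useful data is the k*N-coefficient mask plus body_count bodies, and only the remaining body slots need zeroing, starting right after the stored bodies. A sketch with assumed sizes:

```cpp
// Layout sketch with assumed values (not code from this PR).
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t glwe_dimension = 1, polynomial_size = 1024, body_count = 32;
  // The old length added body_count * body_count, apparently a typo; the
  // unpacked length is mask plus stored bodies:
  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
  // Zero only the unused tail of the body polynomial, starting at
  // initial_out_len rather than at the start of the body section:
  uint32_t zeroed_offset = initial_out_len;           // 1056
  uint32_t zeroed_len = polynomial_size - body_count; // 992
  printf("zero [%u, %u)\n", zeroed_offset, zeroed_offset + zeroed_len);
}
```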
@@ -187,7 +185,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);

auto num_lwes = mem_ptr->body_count;
auto num_lwes = mem_ptr->num_lwes;

// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
@@ -199,17 +197,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Apply a PBS to evaluate the LUT, reduce the noise and go from a small
/// LWE dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
compression_params.small_lwe_dimension + 1);

/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
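The single-GPU branch writes PBS results straight into lwe_array_out. With several GPUs, the extracted LWEs are first scattered from GPU 0 into per-GPU buffers, the PBS runs on each shard with trivial input indexes, and the gather copies results back to GPU 0 while restoring the original ordering through h_lwe_indexes_out; the final loop synchronizes every active stream so GPU 0 cannot observe partial results.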

template <typename Torus>
@@ -227,12 +266,12 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {

*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_lwes, storage_log_modulus, allocate_gpu_memory);
num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
}
#endif
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -120,6 +120,7 @@ extern "C" {
carry_modulus: u32,
pbs_type: u32,
storage_log_modulus: u32,
bodies_count: u32,
allocate_gpu_memory: bool,
);
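The parameter order here has to mirror the C declaration in compression.h exactly (storage_log_modulus, then the new bodies_count, then allocate_gpu_memory): extern "C" bindings are not checked against the header at compile time, so a mismatch would silently misread arguments at the FFI boundary.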

103 changes: 44 additions & 59 deletions tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
@@ -138,79 +138,64 @@ mod tests {
use super::*;
use crate::integer::gpu::gen_keys_radix_gpu;
use crate::integer::ClientKey;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;

const NB_TESTS: usize = 10;
#[test]
fn test_gpu_ciphertext_compression() {
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);

let private_compression_key =
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);

let streams = CudaStreams::new_multi_gpu();

let num_blocks = 4;
let num_blocks = 32;
let (radix_cks, _) = gen_keys_radix_gpu(
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
num_blocks,
&streams,
);

let (cuda_compression_key, cuda_decompression_key) =
radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);

let ct1 = radix_cks.encrypt(3_u32);
let ct2 = radix_cks.encrypt(2_u32);
let ct3 = radix_cks.encrypt_signed(-2);
let ct4 = cks.encrypt_bool(true);

// Copy to GPU
let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams);
let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams);
let d_ct4 = CudaBooleanBlock::from_boolean_block(&ct4, &streams);

let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
.push(d_ct1, &streams)
.push(d_ct2, &streams)
.push(d_ct3, &streams)
.push(d_ct4, &streams)
.build(&cuda_compression_key, &streams);

let d_decompressed1 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
};

let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed1);

assert_eq!(decrypted, 3_u32);
let d_decompressed2 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
};

let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed2);

assert_eq!(decrypted, 2_u32);
let d_decompressed3 = CudaSignedRadixCiphertext {
ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams),
};

let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams);
let decrypted: i32 = radix_cks.decrypt_signed(&decompressed3);

assert_eq!(decrypted, -2);
let d_decompressed4 = CudaBooleanBlock::from_cuda_radix_ciphertext(cuda_compressed.get(
3,
&cuda_decompression_key,
&streams,
));

let decompressed4 = d_decompressed4.to_boolean_block(&streams);
let decrypted = radix_cks.decrypt_bool(&decompressed4);

assert!(decrypted);
for _ in 0..NB_TESTS {
let ct1 = radix_cks.encrypt(3_u32);
let ct2 = radix_cks.encrypt_signed(-2);
let ct3 = radix_cks.encrypt_bool(true);

// Copy to GPU
let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
let d_ct2 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct2, &streams);
let d_ct3 = CudaBooleanBlock::from_boolean_block(&ct3, &streams);

let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
.push(d_ct1, &streams)
.push(d_ct2, &streams)
.push(d_ct3, &streams)
.build(&cuda_compression_key, &streams);

let d_decompressed1 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
};
let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed1);
assert_eq!(decrypted, 3_u32);

let d_decompressed2 = CudaSignedRadixCiphertext {
ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
};
let decompressed2 = d_decompressed2.to_signed_radix_ciphertext(&streams);
let decrypted: i32 = radix_cks.decrypt_signed(&decompressed2);
assert_eq!(decrypted, -2);

let d_decompressed3 = CudaBooleanBlock::from_cuda_radix_ciphertext(
cuda_compressed.get(2, &cuda_decompression_key, &streams),
);
let decompressed3 = d_decompressed3.to_boolean_block(&streams);
let decrypted = radix_cks.decrypt_bool(&decompressed3);
assert!(decrypted);
}
}
}
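The looped test now covers an unsigned, a signed and a boolean ciphertext per iteration over 32 blocks, and its path (integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression) is exactly the filter used by the new test_integer_compression_gpu Makefile target.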