Skip to content

Commit

Permalink
chore(gpu): refactor full propagation to track noise / degree
Browse files: browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Mar 5, 2025
1 parent f7655cc commit d4a1ded
Show file tree
Hide file tree
Showing 11 changed files with 92 additions and 62 deletions.
3 changes: 2 additions & 1 deletion backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ void scratch_cuda_full_propagation_64(

void cuda_full_propagation_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
uint32_t gpu_count,
CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);

Expand Down
29 changes: 14 additions & 15 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -841,8 +841,8 @@ template <typename Torus> struct int_fullprop_buffer {

int_radix_lut<Torus> *lut;

Torus *tmp_small_lwe_vector;
Torus *tmp_big_lwe_vector;
CudaRadixCiphertextFFI *tmp_small_lwe_vector;
CudaRadixCiphertextFFI *tmp_big_lwe_vector;

int_fullprop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
Expand Down Expand Up @@ -889,17 +889,14 @@ template <typename Torus> struct int_fullprop_buffer {

lut->broadcast_lut(streams, gpu_indexes, 0);

// Temporary arrays
Torus small_vector_size =
2 * (params.small_lwe_dimension + 1) * sizeof(Torus);
Torus big_vector_size =
2 * (params.glwe_dimension * params.polynomial_size + 1) *
sizeof(Torus);

tmp_small_lwe_vector = (Torus *)cuda_malloc_async(
small_vector_size, streams[0], gpu_indexes[0]);
tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
big_vector_size, streams[0], gpu_indexes[0]);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_small_lwe_vector, 2,
params.small_lwe_dimension);
tmp_big_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_big_lwe_vector, 2,
params.big_lwe_dimension);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes);
}
Expand All @@ -911,8 +908,10 @@ template <typename Torus> struct int_fullprop_buffer {
lut->release(streams, gpu_indexes, 1);
delete lut;

cuda_drop_async(tmp_small_lwe_vector, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_big_lwe_vector, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_small_lwe_vector);
delete tmp_small_lwe_vector;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_big_lwe_vector);
delete tmp_big_lwe_vector;
}
};

Expand Down
10 changes: 5 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@

void cuda_full_propagation_64_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
uint32_t gpu_count,
CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {

int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;

host_full_propagate_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks), buffer, (uint64_t **)(ksks), bsks,
num_blocks);
host_full_propagate_inplace<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, input_blocks, buffer,
(uint64_t **)(ksks), bsks, num_blocks);
}

void scratch_cuda_full_propagation_64(
Expand Down
52 changes: 34 additions & 18 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
lut->num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < num_radix_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
Expand Down Expand Up @@ -964,6 +965,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
lut->num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[i % lut->num_blocks];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
Expand Down Expand Up @@ -1173,6 +1175,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
lut->num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < num_radix_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
Expand Down Expand Up @@ -1974,7 +1977,8 @@ void host_compute_shifted_blocks_and_borrow_states(
template <typename Torus>
void host_full_propagate_inplace(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *input_blocks,
uint32_t gpu_count,
CudaRadixCiphertextFFI *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus *const *ksks, void *const *bsks,
uint32_t num_blocks) {
Expand All @@ -1987,39 +1991,51 @@ void host_full_propagate_inplace(cudaStream_t const *streams,
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (int i = 0; i < num_blocks; i++) {
auto cur_input_block = &input_blocks[i * big_lwe_size];
CudaRadixCiphertextFFI cur_input_block;
as_radix_ciphertext_slice<Torus>(&cur_input_block, input_blocks, i, i + 1);

/// Since the keyswitch is done on one input only, use only 1 GPU
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
streams, gpu_indexes, 1, (Torus *)(mem_ptr->tmp_small_lwe_vector->ptr),
mem_ptr->lut->lwe_trivial_indexes, (Torus *)cur_input_block.ptr,
mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
mem_ptr->tmp_small_lwe_vector,
small_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, 1, 2,
mem_ptr->tmp_small_lwe_vector, 0, 1);

execute_pbs_async<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
streams, gpu_indexes, 1, (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
mem_ptr->lut->lut_indexes_vec,
(Torus *)mem_ptr->tmp_small_lwe_vector->ptr,
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride);

cuda_memcpy_async_gpu_to_gpu(
(void *)cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
&cur_input_block, 0, 1,
mem_ptr->tmp_big_lwe_vector, 0, 1);
Torus lut_indexes[mem_ptr->lut->num_blocks];
cuda_memcpy_async_to_cpu(&lut_indexes, mem_ptr->lut->get_lut_indexes(0, 0),
mem_ptr->lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
input_blocks->degrees[i] = mem_ptr->lut->degrees[lut_indexes[0]];
input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;

if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
(Torus const *)next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
CudaRadixCiphertextFFI next_input_block;
as_radix_ciphertext_slice<Torus>(&next_input_block, input_blocks, i + 1,
i + 2);
CudaRadixCiphertextFFI second_input;
as_radix_ciphertext_slice<Torus>(&second_input,
mem_ptr->tmp_big_lwe_vector, 1, 2);

host_addition<Torus>(streams[0], gpu_indexes[0], &next_input_block,
&next_input_block, &second_input, 1);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
input_blocks: *mut ffi::c_void,
input_blocks: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
bsks: *const *mut ffi::c_void,
Expand Down
24 changes: 21 additions & 3 deletions tfhe/src/integer/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1249,7 +1249,7 @@ pub unsafe fn unchecked_scalar_comparison_integer_radix_kb_async<T: UnsignedInte
/// is required
pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaVec<T>,
radix_lwe_input: &mut CudaRadixCiphertext,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
lwe_dimension: LweDimension,
Expand All @@ -1267,7 +1267,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
) {
assert_eq!(
streams.gpu_indexes[0],
radix_lwe_input.gpu_index(0),
radix_lwe_input.d_blocks.0.d_vec.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
Expand All @@ -1281,6 +1281,23 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
"GPU error: all data should reside on the same GPU."
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let mut radix_lwe_input_degrees = radix_lwe_input
.info
.blocks
.iter()
.map(|b| b.degree.0)
.collect();
let mut radix_lwe_input_noise_levels = radix_lwe_input
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_radix_lwe_input = prepare_cuda_radix_ffi(
radix_lwe_input,
&mut radix_lwe_input_degrees,
&mut radix_lwe_input_noise_levels,
);
scratch_cuda_full_propagation_64(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
Expand All @@ -1303,7 +1320,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
streams.len() as u32,
radix_lwe_input.as_mut_c_ptr(0),
&mut cuda_ffi_radix_lwe_input,
mem_ptr,
keyswitch_key.ptr.as_ptr(),
bootstrapping_key.ptr.as_ptr(),
Expand All @@ -1315,6 +1332,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
);
update_noise_degree(radix_lwe_input, &cuda_ffi_radix_lwe_input);
}

#[allow(clippy::too_many_arguments)]
Expand Down
10 changes: 1 addition & 9 deletions tfhe/src/integer/gpu/server_key/radix/add.rs
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ impl CudaServerKey {
let output_flag = OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED);

let mut ct_res = lhs.duplicate_async(stream);
let mut carry_out: CudaSignedRadixCiphertext = self
let carry_out: CudaSignedRadixCiphertext = self
.add_and_propagate_single_carry_assign_async(
&mut ct_res,
rhs,
Expand All @@ -578,14 +578,6 @@ impl CudaServerKey {
output_flag,
);

if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
&& rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
{
carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO);
} else {
carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL);
}

let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext);

(ct_res, ct_overflowed)
Expand Down
12 changes: 2 additions & 10 deletions tfhe/src/integer/gpu/server_key/radix/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ impl CudaServerKey {
CudaBootstrappingKey::Classic(d_bsk) => {
full_propagate_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
ciphertext,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
d_bsk.input_lwe_dimension(),
Expand All @@ -403,7 +403,7 @@ impl CudaServerKey {
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
full_propagate_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
ciphertext,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
d_multibit_bsk.input_lwe_dimension(),
Expand All @@ -422,14 +422,6 @@ impl CudaServerKey {
}
}
}
ciphertext.info.blocks.iter_mut().for_each(|b| {
b.degree = Degree::new(b.message_modulus.0 - 1);
b.noise_level = if b.noise_level == NoiseLevel::ZERO {
NoiseLevel::ZERO
} else {
NoiseLevel::NOMINAL
};
});
}

/// Prepend trivial zero LSB blocks to an existing [`CudaUnsignedRadixCiphertext`] or
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,10 @@ where
expected overflow flag {expected_overflowed}, got {decrypted_overflowed}"
);
assert_eq!(encrypted_overflow.0.degree.get(), 1);
#[cfg(feature = "gpu")]
assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::NOMINAL);

#[cfg(not(feature = "gpu"))]
assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::ZERO);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ where
expected overflow flag {expected_overflowed}, got {decrypted_overflowed}"
);
assert_eq!(encrypted_overflow.0.degree.get(), 1);
#[cfg(feature = "gpu")]
assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::NOMINAL);

#[cfg(not(feature = "gpu"))]
assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::ZERO);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,10 @@ where
expected overflow flag {expected_overflowed}, got {decrypted_overflowed}"
);
assert_eq!(encrypted_overflow.0.degree.get(), 1);
#[cfg(feature = "gpu")]
assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::NOMINAL);

#[cfg(not(feature = "gpu"))]
assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::ZERO);
}
}
Expand Down

0 comments on commit d4a1ded

Please sign in to comment.