chore(gpu): minor improvement on the LUT generation function and in are_all_comparisons_block_true()

pdroalves authored and agnesLeroy committed Feb 24, 2024
1 parent f38a9a9 commit 753c7aa
Showing 5 changed files with 47 additions and 42 deletions.
6 changes: 6 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
@@ -80,6 +80,12 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);

void cuda_synchronize_stream(cuda_stream_t *stream);

void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data);

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
}

template <typename Torus>
34 changes: 8 additions & 26 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -850,8 +850,10 @@ template <typename Torus> struct int_are_all_block_true_buffer {
COMPARISON_TYPE op;
int_radix_params params;

int_radix_lut<Torus> *is_max_value_lut;
int_radix_lut<Torus> *is_equal_to_num_blocks_lut;
// This map stores LUTs that check the equality between some input and values
// of interest in are_all_block_true(), such as max_value (the maximum message
// value).
std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;

Torus *tmp_block_accumulated;

@@ -869,34 +871,14 @@ template <typename Torus> struct int_are_all_block_true_buffer {
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_block_accumulated = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), stream);

// LUT
// We need three LUTs:
// (x & max_value as u64) == max_value
// x != 0
// (x & max_value as u64) == blocks.len()

auto is_max_value_lut_f = [total_modulus](Torus x) -> Torus {
Torus max_value = total_modulus - 1;
return (x & max_value) == max_value;
};

is_max_value_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
is_equal_to_num_blocks_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator<Torus>(
stream, is_max_value_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_max_value_lut_f);
}
}

void release(cuda_stream_t *stream) {
is_max_value_lut->release(stream);
delete is_max_value_lut;
is_equal_to_num_blocks_lut->release(stream);
delete is_equal_to_num_blocks_lut;
for (auto &lut : is_equal_to_lut_map) {
lut.second->release(stream);
}
is_equal_to_lut_map.clear();

cuda_drop_async(tmp_block_accumulated, stream);
}
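
Replacing the two fixed LUT pointers with is_equal_to_lut_map turns the buffer into a lazy cache: the LUT that tests equality against a given value is generated at most once and reused on later calls. Below is a minimal standalone sketch of that pattern, with a placeholder Lut type standing in for int_radix_lut<Torus> and no stream handling.

#include <unordered_map>

struct Lut { /* placeholder for int_radix_lut<Torus> */ };

struct LutCache {
  // Maps a value of interest (e.g. a chunk length) to the LUT that tests
  // equality against it.
  std::unordered_map<int, Lut *> is_equal_to_lut_map;

  Lut *get_or_create(int value) {
    auto it = is_equal_to_lut_map.find(value);
    if (it != is_equal_to_lut_map.end())
      return it->second; // already generated, reuse it
    auto *lut = new Lut{}; // the real code generates a device accumulator here
    is_equal_to_lut_map[value] = lut;
    return lut;
  }

  void release() {
    for (auto &kv : is_equal_to_lut_map)
      delete kv.second; // the diff's release() calls lut.second->release(stream)
    is_equal_to_lut_map.clear();
  }
};
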
12 changes: 12 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -228,3 +228,15 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
}

void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }

void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data) {

check_cuda_error(
cudaStreamAddCallback(stream->stream, callback, user_data, 0));
}

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer) {
free(host_pointer);
}
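
A minimal usage sketch of the two helpers above (not part of the diff; d_lut, h_lut and lut_size are hypothetical, and cuda_memcpy_async_to_gpu is the backend's async host-to-device copy used later in this commit): rather than synchronizing the stream before freeing a host staging buffer, the free is enqueued as a callback that CUDA runs once all previously submitted work on the stream has completed.

uint64_t *h_lut = (uint64_t *)malloc(lut_size * sizeof(uint64_t));
// ... fill h_lut on the host ...
cuda_memcpy_async_to_gpu(d_lut, h_lut, lut_size * sizeof(uint64_t), stream);
// Freeing h_lut immediately could race with the asynchronous copy; hand the
// pointer to the stream and let the callback free it after the copy is done.
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
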
25 changes: 15 additions & 10 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -99,29 +99,34 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;

// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else if (chunk_length == max_value) {
// is_max_value LUT
lut = are_all_block_true_buffer->is_max_value_lut;
} else {
// is_equal_to_num_blocks LUT
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
if (chunk_length != lut_num_blocks) {
if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
(*is_equal_to_num_blocks_map).end()) {
// The LUT is already computed
lut = (*is_equal_to_num_blocks_map)[chunk_length];
} else {
// LUT needs to be computed
auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
num_radix_blocks, true);

auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, is_equal_to_num_blocks_lut_f);
stream, new_lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);

// We don't have to generate this lut again
lut_num_blocks = chunk_length;
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
}
}

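For illustration only (not part of the diff), here is a cleartext model of what the generated accumulator encodes, assuming message_modulus = carry_modulus = 4, hence total_modulus = 16 and max_value = 15: after chunk_length boolean blocks are summed into a single block, the LUT maps the accumulated value to 1 exactly when every block in the chunk was true.

// Cleartext model with assumed example parameters max_value = 15,
// chunk_length = 3.
uint64_t max_value = 15;
uint64_t chunk_length = 3;
auto is_equal_to_num_blocks_lut_f = [max_value, chunk_length](uint64_t x) -> uint64_t {
  return (x & max_value) == chunk_length;
};
// The sum of 3 boolean blocks lies in [0, 3]:
//   f(0) == 0, f(1) == 0, f(2) == 0, f(3) == 1
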
12 changes: 6 additions & 6 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -239,8 +239,8 @@ void generate_device_accumulator_bivariate(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);

cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}

/*
@@ -271,8 +271,8 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);

cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}

template <typename Torus>
@@ -461,8 +461,8 @@ void scratch_cuda_full_propagation(
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
cuda_stream_add_callback(stream, host_free_on_stream_callback,
h_lwe_indexes);
}

// Temporary arrays