Skip to content

Commit

Permalink
feat(gpu): Implement 128 bit classic pbs
Browse files Browse the repository at this point in the history
  • Loading branch information
bbarbakadze committed Mar 5, 2025
1 parent c1d534e commit 7f3ac17
Show file tree
Hide file tree
Showing 11 changed files with 407 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);

void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);

void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down
14 changes: 7 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct alignas(16) f128 {
#else
double s = a + b;
return f128(s, b - (s - a));
#endif;
#endif
}

// Two-sum
Expand Down Expand Up @@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
return d;
}

__host__ __device__ double u128_to_f64(__uint128_t x) {
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
const __uint128_t ONE = 1;
const double A = ONE << 52;
const double B = ONE << 104;
Expand Down Expand Up @@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
}
}

__host__ __device__ __uint128_t f64_to_u128(const double f) {
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
const __uint128_t ONE = 1;
const uint64_t f_bits = double_to_bits(f);
if (f_bits < 1023ull << 52) {
Expand All @@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
}
}

__host__ __device__ __uint128_t f64_to_i128(const double f) {
__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
// Get raw bits of the double
const uint64_t f_bits = double_to_bits(f);

Expand Down Expand Up @@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
return (f_bits >> 63) ? -result : result;
}

__host__ __device__ double i128_to_f64(__int128_t const x) {
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
__uint128_t abs =
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);

return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
}
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
const double first_approx = i128_to_f64(x);
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
const __uint128_t first_approx_roundtrip =
Expand All @@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
return f128(first_approx, correction);
}

__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__
Expand Down
10 changes: 5 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ __global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
double *in_im_lo, double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo, double *buffer) {
extern __shared__ double sharedMemoryFFT[];
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;

if (SMD == NOSM) {
Expand All @@ -322,10 +322,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}

Index tid = threadIdx.x;
Expand Down
13 changes: 13 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
(const int64_t *)src, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
printf("bsk transform.cu\n");

uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key_u128(
static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
(const __uint128_t *)src, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
Expand Down
113 changes: 113 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft128/fft128.cuh"

#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
Expand Down Expand Up @@ -250,5 +252,116 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cuda_drop_async(buffer, stream, gpu_index);
cudaFreeHost(h_bsk);
}
template <int N>
__global__ void dprint_array(double *a) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (int i = 0; i < N; i++)
printf("%.30f\n", a[i]);
}
}
template <class params>
void convert_and_transform_128(cudaStream_t stream,
uint32_t gpu_index, double *d_re0, double *d_re1, double *d_im0,
double *d_im1,
__uint128_t const *d_standard, uint32_t number_of_samples) {

printf("bsk transform\n");
size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);

// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}

// convert u128 into 4 x double
batch_convert_u128_to_f128_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
d_standard);

// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
} else {
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
}
cuda_drop_async(buffer, stream, gpu_index);

printf("#cuda\n");
cudaDeviceSynchronize();

dprint_array<params::degree / 2><<<1, 1>>>(d_re0);

}



inline void cuda_convert_lwe_programmable_bootstrap_key_u128(cudaStream_t stream,
uint32_t gpu_index,
double *dest, __uint128_t const *src,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cuda_set_device(gpu_index);

// Here the buffer size is the size of double times the number of polynomials time 4
// each polynomial is represented with 4 double array with size polynomial_size / 2
// into the complex domain to perform the FFT
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double) * 4;

__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
double *d_bsk = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);

double *d_re0 = d_bsk + 0ULL * total_polynomials * polynomial_size / 2;
double *d_re1 = d_bsk + 1ULL * total_polynomials * polynomial_size / 2;
double *d_im0 = d_bsk + 2ULL * total_polynomials * polynomial_size / 2;
double *d_im1 = d_bsk + 3ULL * total_polynomials * polynomial_size / 2;

cuda_memcpy_async_to_gpu(d_standard, src, buffer_size,
stream, gpu_index);



switch (polynomial_size) {
case 256:
convert_and_transform_128<AmortizedDegree<256>>(stream, gpu_index, d_re0, d_re1, d_im0,
d_im1,
d_standard, total_polynomials);
case 512:
convert_and_transform_128<AmortizedDegree<512>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
break;
case 1024:
convert_and_transform_128<AmortizedDegree<1024>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
break;
case 2048:
convert_and_transform_128<AmortizedDegree<2048>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
break;
case 4096:
convert_and_transform_128<AmortizedDegree<4096>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
break;
default:
PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..4096].")
}

cuda_drop_async(d_standard, stream, gpu_index);
cuda_drop_async(d_bsk, stream, gpu_index);
}


#endif // CNCRT_BSK_H
12 changes: 12 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1431,6 +1431,18 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_amortized_32(
stream: *mut ffi::c_void,
Expand Down
4 changes: 4 additions & 0 deletions tfhe/src/core_crypto/fft_impl/fft128/crypto/ggsw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,10 @@ where
fourier_im1,
coef_poly.as_ref(),
);
println!("re0: {:?}", fourier_re0);
println!("re1: {:?}", fourier_re1);
println!("im0: {:?}", fourier_im0);
println!("im1: {:?}", fourier_im1);
}
}
implementation(self.as_mut_view(), coef_ggsw.as_view(), fft);
Expand Down
Loading

0 comments on commit 7f3ac17

Please sign in to comment.