feat(gpu): Implement 128 bit classic pbs

zama-ai · Mar 5, 2025 · 7f3ac17 · 7f3ac17
1 parent c1d534e
commit 7f3ac17
Show file tree

Hide file tree

Showing 11 changed files with 407 additions and 25 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
     uint32_t polynomial_size);
 
+void cuda_convert_lwe_programmable_bootstrap_key_128(
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);
+
 void scratch_cuda_programmable_bootstrap_amortized_32(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size,

diff --git a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -21,7 +21,7 @@ struct alignas(16) f128 {
 #else
     double s = a + b;
     return f128(s, b - (s - a));
-#endif;
+#endif
   }
 
   // Two-sum
@@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
   return d;
 }
 
-__host__ __device__ double u128_to_f64(__uint128_t x) {
+__host__ __device__ inline double u128_to_f64(__uint128_t x) {
   const __uint128_t ONE = 1;
   const double A = ONE << 52;
   const double B = ONE << 104;
@@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
   }
 }
 
-__host__ __device__ __uint128_t f64_to_u128(const double f) {
+__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
   const __uint128_t ONE = 1;
   const uint64_t f_bits = double_to_bits(f);
   if (f_bits < 1023ull << 52) {
@@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
   }
 }
 
-__host__ __device__ __uint128_t f64_to_i128(const double f) {
+__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
   // Get raw bits of the double
   const uint64_t f_bits = double_to_bits(f);
 
@@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
   return (f_bits >> 63) ? -result : result;
 }
 
-__host__ __device__ double i128_to_f64(__int128_t const x) {
+__host__ __device__ inline double i128_to_f64(__int128_t const x) {
   uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
   __uint128_t abs =
       (x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
 
   return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
 }
-__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
+__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
   const double first_approx = i128_to_f64(x);
   const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
   const __uint128_t first_approx_roundtrip =
@@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
   return f128(first_approx, correction);
 }
 
-__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
+__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
   auto x = f128::sub_estimate(a, f128::f128_floor(a));
   const double normalization = 340282366920938500000000000000000000000.;
 #ifdef __CUDA_ARCH__

diff --git a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -309,7 +309,7 @@ __global__ void
 batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
                  double *in_im_lo, double *out_re_hi, double *out_re_lo,
                  double *out_im_hi, double *out_im_lo, double *buffer) {
-  extern __shared__ double sharedMemoryFFT[];
+  extern __shared__ double sharedMemoryFFT128[];
   double *re_hi, *re_lo, *im_hi, *im_lo;
 
   if (SMD == NOSM) {
@@ -322,10 +322,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
     im_lo =
         &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
   } else {
-    re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
-    re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
-    im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
-    im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
+    re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
+    re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
+    im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
+    im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
   }
 
   Index tid = threadIdx.x;

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
@@ -22,6 +22,19 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
       (const int64_t *)src, polynomial_size, total_polynomials);
 }
 
+void cuda_convert_lwe_programmable_bootstrap_key_128(
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size) {
+  printf("bsk transform.cu\n");
+
+  uint32_t total_polynomials =
+      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
+  cuda_convert_lwe_programmable_bootstrap_key_u128(
+      static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
+      (const __uint128_t *)src, polynomial_size, total_polynomials);
+}
+
 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
     void *stream, uint32_t gpu_index, void *dest, void const *src,
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -3,6 +3,8 @@
 
 #include "device.h"
 #include "fft/bnsmfft.cuh"
+#include "fft128/fft128.cuh"
+
 #include "pbs/programmable_bootstrap.h"
 #include "pbs/programmable_bootstrap_multibit.h"
 #include "polynomial/parameters.cuh"
@@ -250,5 +252,116 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
   cuda_drop_async(buffer, stream, gpu_index);
   cudaFreeHost(h_bsk);
 }
+template <int N>
+__global__ void dprint_array(double *a) {
+  if (threadIdx.x == 0 && blockIdx.x == 0) {
+    for (int i = 0; i < N; i++)
+      printf("%.30f\n", a[i]);
+  }
+}
+template <class params>
+void convert_and_transform_128(cudaStream_t stream,
+                               uint32_t gpu_index, double *d_re0, double *d_re1, double *d_im0,
+                               double *d_im1,
+                               __uint128_t const *d_standard, uint32_t number_of_samples) {
+
+  printf("bsk transform\n");
+  size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
+  int grid_size = number_of_samples;
+  int block_size = params::degree / params::opt;
+  bool full_sm =
+      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
+  size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
+  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
+  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
+
+  // configure shared memory for batch fft kernel
+  if (full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
+
+  // convert u128 into 4 x double
+  batch_convert_u128_to_f128_as_torus<params>
+  <<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
+                                         d_standard);
+
+  // call negacyclic 128 bit forward fft.
+  if (full_sm) {
+    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
+    <<<grid_size, block_size, shared_memory_size, stream>>>(
+        d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
+  } else {
+    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
+    <<<grid_size, block_size, shared_memory_size, stream>>>(
+        d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
+  }
+  cuda_drop_async(buffer, stream, gpu_index);
+
+  printf("#cuda\n");
+  cudaDeviceSynchronize();
+
+  dprint_array<params::degree / 2><<<1, 1>>>(d_re0);
+
+}
+
+
+
+inline void cuda_convert_lwe_programmable_bootstrap_key_u128(cudaStream_t stream,
+                                                 uint32_t gpu_index,
+                                                 double *dest, __uint128_t const *src,
+                                                 uint32_t polynomial_size,
+                                                 uint32_t total_polynomials) {
+  cuda_set_device(gpu_index);
+
+  // Here the buffer size is the size of double times the number of polynomials time 4
+  // each polynomial is represented with 4 double array with size polynomial_size / 2
+  // into the complex domain to perform the FFT
+  size_t buffer_size =
+      total_polynomials * polynomial_size / 2 * sizeof(double) * 4;
+
+  __uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
+  double *d_bsk = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
+
+  double *d_re0 = d_bsk + 0ULL * total_polynomials * polynomial_size / 2;
+  double *d_re1 = d_bsk + 1ULL * total_polynomials * polynomial_size / 2;
+  double *d_im0 = d_bsk + 2ULL * total_polynomials * polynomial_size / 2;
+  double *d_im1 = d_bsk + 3ULL * total_polynomials * polynomial_size / 2;
+
+  cuda_memcpy_async_to_gpu(d_standard, src, buffer_size,
+                           stream, gpu_index);
+
+
+
+  switch (polynomial_size) {
+    case 256:
+      convert_and_transform_128<AmortizedDegree<256>>(stream, gpu_index, d_re0, d_re1, d_im0,
+          d_im1,
+          d_standard, total_polynomials);
+    case 512:
+      convert_and_transform_128<AmortizedDegree<512>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
+      break;
+    case 1024:
+      convert_and_transform_128<AmortizedDegree<1024>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
+      break;
+    case 2048:
+      convert_and_transform_128<AmortizedDegree<2048>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
+      break;
+    case 4096:
+      convert_and_transform_128<AmortizedDegree<4096>>(stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, total_polynomials);
+      break;
+    default:
+    PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..4096].")
+  }
+
+  cuda_drop_async(d_standard, stream, gpu_index);
+  cuda_drop_async(d_bsk, stream, gpu_index);
+}
+
 
 #endif // CNCRT_BSK_H
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1431,6 +1431,18 @@ unsafe extern "C" {
         polynomial_size: u32,
     );
 }
+unsafe extern "C" {
+    pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        dest: *mut ffi::c_void,
+        src: *const ffi::c_void,
+        input_lwe_dim: u32,
+        glwe_dim: u32,
+        level_count: u32,
+        polynomial_size: u32,
+    );
+}
 unsafe extern "C" {
     pub fn scratch_cuda_programmable_bootstrap_amortized_32(
         stream: *mut ffi::c_void,

diff --git a/tfhe/src/core_crypto/fft_impl/fft128/crypto/ggsw.rs b/tfhe/src/core_crypto/fft_impl/fft128/crypto/ggsw.rs
@@ -328,6 +328,10 @@ where
                     fourier_im1,
                     coef_poly.as_ref(),
                 );
+                println!("re0: {:?}", fourier_re0);
+                println!("re1: {:?}", fourier_re1);
+                println!("im0: {:?}", fourier_im0);
+                println!("im1: {:?}", fourier_im1);
             }
         }
         implementation(self.as_mut_view(), coef_ggsw.as_view(), fft);