chore(gpu): minor improvement on the LUT generation function and in are_all_comparisons_block_true()

pdroalves authored and agnesLeroy committed Feb 24, 2024
1 parent f38a9a9 commit 753c7aa
Showing 5 changed files with 47 additions and 42 deletions.
6 changes: 6 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
@@ -80,6 +80,12 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);

void cuda_synchronize_stream(cuda_stream_t *stream);

void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data);

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
}

template <typename Torus>
34 changes: 8 additions & 26 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -850,8 +850,10 @@ template <typename Torus> struct int_are_all_block_true_buffer {
COMPARISON_TYPE op;
int_radix_params params;

int_radix_lut<Torus> *is_max_value_lut;
int_radix_lut<Torus> *is_equal_to_num_blocks_lut;
// This map stores LUTs that check the equality between some input and values
// of interest in are_all_block_true(), such as max_value (the maximum message
// value).
std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;

Torus *tmp_block_accumulated;

@@ -869,34 +871,14 @@ template <typename Torus> struct int_are_all_block_true_buffer {
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_block_accumulated = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), stream);

// LUT
// We need three LUTs:
// (x & max_value as u64) == max_value
// x != 0
// (x & max_value as u64) == blocks.len()

auto is_max_value_lut_f = [total_modulus](Torus x) -> Torus {
Torus max_value = total_modulus - 1;
return (x & max_value) == max_value;
};

is_max_value_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
is_equal_to_num_blocks_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator<Torus>(
stream, is_max_value_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_max_value_lut_f);
}
}

void release(cuda_stream_t *stream) {
is_max_value_lut->release(stream);
delete is_max_value_lut;
is_equal_to_num_blocks_lut->release(stream);
delete is_equal_to_num_blocks_lut;
for (auto &lut : is_equal_to_lut_map) {
lut.second->release(stream);
}
is_equal_to_lut_map.clear();

cuda_drop_async(tmp_block_accumulated, stream);
}
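
Replacing the two fixed LUT pointers with is_equal_to_lut_map turns the buffer into a lazy cache: the LUT that tests equality against a given value is generated at most once and reused on later calls. Below is a minimal standalone sketch of that pattern, with a placeholder Lut type standing in for int_radix_lut<Torus> and no stream handling.

#include <unordered_map>

struct Lut { /* placeholder for int_radix_lut<Torus> */ };

struct LutCache {
  // Maps a value of interest (e.g. a chunk length) to the LUT that tests
  // equality against it.
  std::unordered_map<int, Lut *> is_equal_to_lut_map;

  Lut *get_or_create(int value) {
    auto it = is_equal_to_lut_map.find(value);
    if (it != is_equal_to_lut_map.end())
      return it->second; // already generated, reuse it
    auto *lut = new Lut{}; // the real code generates a device accumulator here
    is_equal_to_lut_map[value] = lut;
    return lut;
  }

  void release() {
    for (auto &kv : is_equal_to_lut_map)
      delete kv.second; // the diff's release() calls lut.second->release(stream)
    is_equal_to_lut_map.clear();
  }
};
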
12 changes: 12 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -228,3 +228,15 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
}

void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }

void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data) {

check_cuda_error(
cudaStreamAddCallback(stream->stream, callback, user_data, 0));
}

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer) {
free(host_pointer);
}
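
A minimal usage sketch of the two helpers above (not part of the diff; d_lut, h_lut and lut_size are hypothetical, and cuda_memcpy_async_to_gpu is the backend's async host-to-device copy used later in this commit): rather than synchronizing the stream before freeing a host staging buffer, the free is enqueued as a callback that CUDA runs once all previously submitted work on the stream has completed.

uint64_t *h_lut = (uint64_t *)malloc(lut_size * sizeof(uint64_t));
// ... fill h_lut on the host ...
cuda_memcpy_async_to_gpu(d_lut, h_lut, lut_size * sizeof(uint64_t), stream);
// Freeing h_lut immediately could race with the asynchronous copy; hand the
// pointer to the stream and let the callback free it after the copy is done.
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
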
25 changes: 15 additions & 10 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -99,29 +99,34 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;

// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else if (chunk_length == max_value) {
// is_max_value LUT
lut = are_all_block_true_buffer->is_max_value_lut;
} else {
// is_equal_to_num_blocks LUT
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
if (chunk_length != lut_num_blocks) {
if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
(*is_equal_to_num_blocks_map).end()) {
// The LUT is already computed
lut = (*is_equal_to_num_blocks_map)[chunk_length];
} else {
// LUT needs to be computed
auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
num_radix_blocks, true);

auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, is_equal_to_num_blocks_lut_f);
stream, new_lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);

// We don't have to generate this lut again
lut_num_blocks = chunk_length;
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
}
}

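For illustration only (not part of the diff), here is a cleartext model of what the generated accumulator encodes, assuming message_modulus = carry_modulus = 4, hence total_modulus = 16 and max_value = 15: after chunk_length boolean blocks are summed into a single block, the LUT maps the accumulated value to 1 exactly when every block in the chunk was true.

// Cleartext model with assumed example parameters max_value = 15,
// chunk_length = 3.
uint64_t max_value = 15;
uint64_t chunk_length = 3;
auto is_equal_to_num_blocks_lut_f = [max_value, chunk_length](uint64_t x) -> uint64_t {
  return (x & max_value) == chunk_length;
};
// The sum of 3 boolean blocks lies in [0, 3]:
//   f(0) == 0, f(1) == 0, f(2) == 0, f(3) == 1
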
12 changes: 6 additions & 6 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -239,8 +239,8 @@ void generate_device_accumulator_bivariate(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);

cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}

/*
@@ -271,8 +271,8 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);

cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}

template <typename Torus>
@@ -461,8 +461,8 @@ void scratch_cuda_full_propagation(
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
cuda_stream_add_callback(stream, host_free_on_stream_callback,
h_lwe_indexes);
}

// Temporary arrays