Use a consistent block size of 256 threads per block

st-tech · Jan 13, 2025 · fc5b58d
1 parent 2015b78
commit fc5b58d
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 18 deletions.
diff --git a/src/cpp/common.hpp b/src/cpp/common.hpp
@@ -30,6 +30,7 @@
 #define FLT_MIN -1.0e8f
 #define DT_MIN 1e-5f
 #define PI 3.14159265358979323846f
+#define BLOCK_SIZE 256
 #define DEBUG_MODE 0
 
 namespace logging {

diff --git a/src/cpp/csrmat/csrmat.cu b/src/cpp/csrmat/csrmat.cu
@@ -10,8 +10,6 @@
 #include "../utility/utility.hpp"
 #include "csrmat.hpp"
 
-#define BLOCK_SIZE 1024
-
 __device__ void Row::alloc() {
     head = 0;
     ref_head = 0;
@@ -185,23 +183,15 @@ __global__ void add_block_offsets_kernel(unsigned *d_data,
 }
 
 unsigned exclusive_scan(unsigned *d_data, unsigned n) {
-    const unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    const unsigned extra_scale = 2;
+    static unsigned _n = n;
+    assert(_n == n);
+    static unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
     static unsigned *d_block_sums = nullptr;
     static unsigned *h_block_sums = nullptr;
-    static unsigned max_num_blocks = 0;
     if (d_block_sums == nullptr) {
-        max_num_blocks = extra_scale * num_blocks;
-        CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums,
-                                     max_num_blocks * sizeof(unsigned)));
-        h_block_sums = new unsigned[max_num_blocks];
-    } else if (max_num_blocks < num_blocks) {
-        max_num_blocks = extra_scale * num_blocks;
-        CUDA_HANDLE_ERROR(cudaFree(d_block_sums));
         CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums,
-                                     max_num_blocks * sizeof(unsigned)));
-        delete[] h_block_sums;
-        h_block_sums = new unsigned[max_num_blocks];
+                                     num_blocks * sizeof(unsigned)));
+        h_block_sums = new unsigned[num_blocks];
     }
     block_scan_kernel<<<num_blocks, BLOCK_SIZE,
                         BLOCK_SIZE * sizeof(unsigned)>>>(d_data, d_block_sums,

diff --git a/src/cpp/utility/dispatcher.cu b/src/cpp/utility/dispatcher.cu
@@ -1,3 +1,5 @@
+#include "../common.hpp"
+
 template <typename Lambda> __global__ void launch_kernel(Lambda func, int n) {
     unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < n) {
@@ -8,7 +10,7 @@ template <typename Lambda> __global__ void launch_kernel(Lambda func, int n) {
 #define DISPATCH_START(n)                                                      \
     {                                                                          \
         const unsigned n_threads(n);                                           \
-        const unsigned block_size = 256;                                       \
+        const unsigned block_size = BLOCK_SIZE;                                \
         const unsigned grid_size = (n_threads + block_size - 1) / block_size;  \
     launch_kernel<<<grid_size, block_size>>>(
 #define DISPATCH_END , n_threads);                                             \

diff --git a/src/cpp/utility/utility.cu b/src/cpp/utility/utility.cu
@@ -14,8 +14,6 @@
 #include "eig-hpp/eigsolve2x2.hpp"
 #include "eig-hpp/eigsolve3x3.hpp"
 
-#define BLOCK_SIZE 256
-
 namespace utility {
 
 __device__ Vec3f compute_vertex_normal(const DataSet &data,