From fc5b58de675f2dd2bb40b1f7802083c723634c11 Mon Sep 17 00:00:00 2001
From: Ryoichi Ando <ryich.ando@gmail.com>
Date: Mon, 13 Jan 2025 15:05:05 +0900
Subject: [PATCH] use a consistent 256 threads per block

---
 src/cpp/common.hpp            |  1 +
 src/cpp/csrmat/csrmat.cu      | 20 +++++---------------
 src/cpp/utility/dispatcher.cu |  4 +++-
 src/cpp/utility/utility.cu    |  2 --
 4 files changed, 9 insertions(+), 18 deletions(-)
diff --git a/src/cpp/common.hpp b/src/cpp/common.hpp
index 9fbde46..076d5c9 100644
--- a/src/cpp/common.hpp
+++ b/src/cpp/common.hpp
@@ -30,6 +30,7 @@
 #define FLT_MIN -1.0e8f
 #define DT_MIN 1e-5f
 #define PI 3.14159265358979323846f
+#define BLOCK_SIZE 256
 #define DEBUG_MODE 0
 
 namespace logging {
diff --git a/src/cpp/csrmat/csrmat.cu b/src/cpp/csrmat/csrmat.cu
index 73445ba..b628735 100644
--- a/src/cpp/csrmat/csrmat.cu
+++ b/src/cpp/csrmat/csrmat.cu
@@ -10,8 +10,6 @@
 #include "../utility/utility.hpp"
 #include "csrmat.hpp"
 
-#define BLOCK_SIZE 1024
-
 __device__ void Row::alloc() {
     head = 0;
     ref_head = 0;
@@ -185,23 +183,15 @@ __global__ void add_block_offsets_kernel(unsigned *d_data,
 }
 
 unsigned exclusive_scan(unsigned *d_data, unsigned n) {
-    const unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    const unsigned extra_scale = 2;
+    static unsigned _n = n;
+    assert(_n == n);
+    static unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
     static unsigned *d_block_sums = nullptr;
     static unsigned *h_block_sums = nullptr;
-    static unsigned max_num_blocks = 0;
     if (d_block_sums == nullptr) {
-        max_num_blocks = extra_scale * num_blocks;
-        CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums,
-                                     max_num_blocks * sizeof(unsigned)));
-        h_block_sums = new unsigned[max_num_blocks];
-    } else if (max_num_blocks < num_blocks) {
-        max_num_blocks = extra_scale * num_blocks;
-        CUDA_HANDLE_ERROR(cudaFree(d_block_sums));
         CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums,
-                                     max_num_blocks * sizeof(unsigned)));
-        delete[] h_block_sums;
-        h_block_sums = new unsigned[max_num_blocks];
+                                     num_blocks * sizeof(unsigned)));
+        h_block_sums = new unsigned[num_blocks];
     }
     block_scan_kernel<<<num_blocks, BLOCK_SIZE,
                         BLOCK_SIZE * sizeof(unsigned)>>>(d_data, d_block_sums,
diff --git a/src/cpp/utility/dispatcher.cu b/src/cpp/utility/dispatcher.cu
index 9c2f407..aa34fe5 100644
--- a/src/cpp/utility/dispatcher.cu
+++ b/src/cpp/utility/dispatcher.cu
@@ -1,3 +1,5 @@
+#include "../common.hpp"
+
 template <typename Lambda> __global__ void launch_kernel(Lambda func, int n) {
     unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < n) {
@@ -8,7 +10,7 @@ template <typename Lambda> __global__ void launch_kernel(Lambda func, int n) {
 #define DISPATCH_START(n)                                                      \
     {                                                                          \
         const unsigned n_threads(n);                                           \
-        const unsigned block_size = 256;                                       \
+        const unsigned block_size = BLOCK_SIZE;                                \
         const unsigned grid_size = (n_threads + block_size - 1) / block_size;  \
     launch_kernel<<<grid_size, block_size>>>(
 #define DISPATCH_END , n_threads);                                             \
diff --git a/src/cpp/utility/utility.cu b/src/cpp/utility/utility.cu
index 3598de0..894ea15 100644
--- a/src/cpp/utility/utility.cu
+++ b/src/cpp/utility/utility.cu
@@ -14,8 +14,6 @@
 #include "eig-hpp/eigsolve2x2.hpp"
 #include "eig-hpp/eigsolve3x3.hpp"
 
-#define BLOCK_SIZE 256
-
 namespace utility {
 
 __device__ Vec3f compute_vertex_normal(const DataSet &data,