From fc5b58de675f2dd2bb40b1f7802083c723634c11 Mon Sep 17 00:00:00 2001 From: Ryoichi Ando Date: Mon, 13 Jan 2025 15:05:05 +0900 Subject: [PATCH] use consistent threads per block 256 --- src/cpp/common.hpp | 1 + src/cpp/csrmat/csrmat.cu | 20 +++++--------------- src/cpp/utility/dispatcher.cu | 4 +++- src/cpp/utility/utility.cu | 2 -- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/src/cpp/common.hpp b/src/cpp/common.hpp index 9fbde46..076d5c9 100644 --- a/src/cpp/common.hpp +++ b/src/cpp/common.hpp @@ -30,6 +30,7 @@ #define FLT_MIN -1.0e8f #define DT_MIN 1e-5f #define PI 3.14159265358979323846f +#define BLOCK_SIZE 256 #define DEBUG_MODE 0 namespace logging { diff --git a/src/cpp/csrmat/csrmat.cu b/src/cpp/csrmat/csrmat.cu index 73445ba..b628735 100644 --- a/src/cpp/csrmat/csrmat.cu +++ b/src/cpp/csrmat/csrmat.cu @@ -10,8 +10,6 @@ #include "../utility/utility.hpp" #include "csrmat.hpp" -#define BLOCK_SIZE 1024 - __device__ void Row::alloc() { head = 0; ref_head = 0; @@ -185,23 +183,15 @@ __global__ void add_block_offsets_kernel(unsigned *d_data, } unsigned exclusive_scan(unsigned *d_data, unsigned n) { - const unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; - const unsigned extra_scale = 2; + static unsigned _n = n; + assert(_n == n); + static unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; static unsigned *d_block_sums = nullptr; static unsigned *h_block_sums = nullptr; - static unsigned max_num_blocks = 0; if (d_block_sums == nullptr) { - max_num_blocks = extra_scale * num_blocks; - CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums, - max_num_blocks * sizeof(unsigned))); - h_block_sums = new unsigned[max_num_blocks]; - } else if (max_num_blocks < num_blocks) { - max_num_blocks = extra_scale * num_blocks; - CUDA_HANDLE_ERROR(cudaFree(d_block_sums)); CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums, - max_num_blocks * sizeof(unsigned))); - delete[] h_block_sums; - h_block_sums = new unsigned[max_num_blocks]; + num_blocks * sizeof(unsigned))); + h_block_sums = new unsigned[num_blocks]; } block_scan_kernel<<>>(d_data, d_block_sums, diff --git a/src/cpp/utility/dispatcher.cu b/src/cpp/utility/dispatcher.cu index 9c2f407..aa34fe5 100644 --- a/src/cpp/utility/dispatcher.cu +++ b/src/cpp/utility/dispatcher.cu @@ -1,3 +1,5 @@ +#include "../common.hpp" + template __global__ void launch_kernel(Lambda func, int n) { unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { @@ -8,7 +10,7 @@ template __global__ void launch_kernel(Lambda func, int n) { #define DISPATCH_START(n) \ { \ const unsigned n_threads(n); \ - const unsigned block_size = 256; \ + const unsigned block_size = BLOCK_SIZE; \ const unsigned grid_size = (n_threads + block_size - 1) / block_size; \ launch_kernel<<>>( #define DISPATCH_END , n_threads); \ diff --git a/src/cpp/utility/utility.cu b/src/cpp/utility/utility.cu index 3598de0..894ea15 100644 --- a/src/cpp/utility/utility.cu +++ b/src/cpp/utility/utility.cu @@ -14,8 +14,6 @@ #include "eig-hpp/eigsolve2x2.hpp" #include "eig-hpp/eigsolve3x3.hpp" -#define BLOCK_SIZE 256 - namespace utility { __device__ Vec3f compute_vertex_normal(const DataSet &data,