Skip to content

Commit

Permalink
Use a consistent threads-per-block size of 256
Browse files (browse the repository at this point in the history)
  • Loading branch information
ryichando committed Jan 13, 2025
1 parent 2015b78 commit fc5b58d
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 18 deletions.
1 change: 1 addition & 0 deletions src/cpp/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#define FLT_MIN -1.0e8f
#define DT_MIN 1e-5f
#define PI 3.14159265358979323846f
#define BLOCK_SIZE 256
#define DEBUG_MODE 0

namespace logging {
Expand Down
20 changes: 5 additions & 15 deletions src/cpp/csrmat/csrmat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
#include "../utility/utility.hpp"
#include "csrmat.hpp"

#define BLOCK_SIZE 1024

__device__ void Row::alloc() {
head = 0;
ref_head = 0;
Expand Down Expand Up @@ -185,23 +183,15 @@ __global__ void add_block_offsets_kernel(unsigned *d_data,
}

unsigned exclusive_scan(unsigned *d_data, unsigned n) {
const unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
const unsigned extra_scale = 2;
static unsigned _n = n;
assert(_n == n);
static unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
static unsigned *d_block_sums = nullptr;
static unsigned *h_block_sums = nullptr;
static unsigned max_num_blocks = 0;
if (d_block_sums == nullptr) {
max_num_blocks = extra_scale * num_blocks;
CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums,
max_num_blocks * sizeof(unsigned)));
h_block_sums = new unsigned[max_num_blocks];
} else if (max_num_blocks < num_blocks) {
max_num_blocks = extra_scale * num_blocks;
CUDA_HANDLE_ERROR(cudaFree(d_block_sums));
CUDA_HANDLE_ERROR(cudaMalloc((void **)&d_block_sums,
max_num_blocks * sizeof(unsigned)));
delete[] h_block_sums;
h_block_sums = new unsigned[max_num_blocks];
num_blocks * sizeof(unsigned)));
h_block_sums = new unsigned[num_blocks];
}
block_scan_kernel<<<num_blocks, BLOCK_SIZE,
BLOCK_SIZE * sizeof(unsigned)>>>(d_data, d_block_sums,
Expand Down
4 changes: 3 additions & 1 deletion src/cpp/utility/dispatcher.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "../common.hpp"

template <typename Lambda> __global__ void launch_kernel(Lambda func, int n) {
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
Expand All @@ -8,7 +10,7 @@ template <typename Lambda> __global__ void launch_kernel(Lambda func, int n) {
#define DISPATCH_START(n) \
{ \
const unsigned n_threads(n); \
const unsigned block_size = 256; \
const unsigned block_size = BLOCK_SIZE; \
const unsigned grid_size = (n_threads + block_size - 1) / block_size; \
launch_kernel<<<grid_size, block_size>>>(
#define DISPATCH_END , n_threads); \
Expand Down
2 changes: 0 additions & 2 deletions src/cpp/utility/utility.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
#include "eig-hpp/eigsolve2x2.hpp"
#include "eig-hpp/eigsolve3x3.hpp"

#define BLOCK_SIZE 256

namespace utility {

__device__ Vec3f compute_vertex_normal(const DataSet &data,
Expand Down

0 comments on commit fc5b58d

Please sign in to comment.