diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index ce4dc823b98..51671d1386d 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -82,7 +82,8 @@ template __global__ __launch_bounds__(default_block_size) void symbolic_count( IndexType num_rows, const IndexType* row_ptrs, const IndexType* lower_ends, const IndexType* inv_postorder, const IndexType* postorder_cols, - const IndexType* postorder_parent, IndexType* row_nnz) + const IndexType* postorder_parent, IndexType* row_nnz, + IndexType* path_lengths) { const auto row = thread::get_subwarp_id_flat(); if (row >= num_rows) { @@ -99,13 +100,54 @@ __global__ __launch_bounds__(default_block_size) void symbolic_count( const auto lane = subwarp.thread_rank(); IndexType count{}; for (auto nz = row_begin + lane; nz < lower_end; nz += subwarp_size) { + IndexType local_count{}; auto node = postorder_cols[nz]; const auto next_node = nz < lower_end - 1 ? postorder_cols[nz + 1] : diag_postorder; while (node < next_node) { - count++; + local_count++; node = postorder_parent[node]; } + path_lengths[nz] = count; + count += local_count; + } + // lower entries plus diagonal + count = reduce(subwarp, count, thrust::plus{}) + 1; + if (lane == 0) { + row_nnz[row] = count; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void symbolic_count_lca( + IndexType num_rows, const IndexType* row_ptrs, const IndexType* lower_ends, + const IndexType* inv_postorder, const IndexType* postorder_cols, + const IndexType* levels, const IndexType* euler_first, + const range_minimum_query lca_rmq, IndexType* row_nnz, + IndexType* path_lengths) +{ + const auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + const auto row_begin = row_ptrs[row]; + const auto level = lca_rmq.min(euler_first[row]); + const auto lower_end = lower_ends[row]; + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto lane = subwarp.thread_rank(); + IndexType count{}; + for (auto nz = row_begin + lane; nz < lower_end; nz += subwarp_size) { + const auto node = postorder_cols[nz]; + const auto euler_pos = euler_first[node]; + const auto next_node = + nz < lower_end - 1 ? postorder_cols[nz + 1] : diag_postorder; + const auto next_euler_pos = euler_first[next_node]; + const auto lca_level = lca_rmq.query(euler_pos, next_euler_pos).min; + const auto path_length = level - lca_level; + path_lengths[nz] = path_length; + count += path_length; } // lower entries plus diagonal count = reduce(subwarp, count, thrust::plus{}) + 1; diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index fa87edd7fe3..7f19e842a82 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -28,6 +28,15 @@ namespace kernels { IndexType* row_nnz, array& tmp_storage) +#define GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT_LCA(ValueType, IndexType) \ + void symbolic_count_lca( \ + std::shared_ptr exec, \ + const matrix::Csr* mtx, \ + const gko::factorization::elimination_forest& forest, \ + const range_minimum_query& lca_rmq, IndexType* row_nnz, \ + array& tmp_storage) + + #define GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE(ValueType, IndexType) \ void symbolic_factorize( \ std::shared_ptr exec, \ @@ -62,6 +71,8 @@ namespace kernels { template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType); \ template \ + GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT_LCA(ValueType, IndexType); \ + template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType); \