From 9bd3df73a7baa5a748915d5a638521f57c397e89 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 13 Jun 2024 14:50:11 -0600 Subject: [PATCH 01/41] Some cleanup and refactoring --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 89 ++++++++----- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 45 +++---- sparse/src/KokkosKernels_Handle.hpp | 5 +- sparse/src/KokkosSparse_sptrsv_handle.hpp | 118 +++++++++--------- 4 files changed, 140 insertions(+), 117 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a64a4d23bc..35188ed52c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -52,6 +52,25 @@ namespace KokkosSparse { namespace Impl { namespace Experimental { +template +struct SptrsvWrap { + + // + // Useful types + // + using execution_space = typename TriSolveHandle::execution_space; + using memory_space = typename TriSolveHandle::memory_space; + using lno_t = typename TriSolveHandle::nnz_lno_t; + using size_type = typename TriSolveHandle::size_type; + using scalar_t = typename TriSolveHandle::scalar_t; + using row_map_t = typename TriSolveHandle::nnz_row_view_t; + using entries_t = typename TriSolveHandle::nnz_lno_view_t; + using values_t = typename TriSolveHandle::nnz_scalar_view_t; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename TriSolveHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename TriSolveHandle::RangePolicy; + #if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \ defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT @@ -64,7 +83,7 @@ struct LargerCutoffTag {}; struct UnsortedLargerCutoffTag {}; template -void print_view1d_solve(const ViewType dv, size_t range = 0) { +static void print_view1d_solve(const ViewType dv, size_t range = 0) { auto v = Kokkos::create_mirror_view(dv); Kokkos::deep_copy(v, dv); std::cout << "Output for view " << v.label() << std::endl; @@ -662,7 +681,7 @@ struct LowerTriLvlSchedTP2SolverFunctor { #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // ----------------------------------------------------------- // Helper functors for Lower-triangular solve with SpMV -template +template struct SparseTriSupernodalSpMVFunctor { using execution_space = typename TriSolveHandle::HandleExecSpace; using memory_space = typename TriSolveHandle::HandleTempMemorySpace; @@ -743,7 +762,7 @@ struct SparseTriSupernodalSpMVFunctor { // ----------------------------------------------------------- // Functor for Lower-triangular solve -template struct LowerTriSupernodalFunctor { using execution_space = typename TriSolveHandle::HandleExecSpace; @@ -950,7 +969,7 @@ struct LowerTriSupernodalFunctor { // ----------------------------------------------------------- // Functor for Upper-triangular solve in CSR -template struct UpperTriSupernodalFunctor { using execution_space = typename TriSolveHandle::HandleExecSpace; @@ -1120,7 +1139,7 @@ struct UpperTriSupernodalFunctor { // ----------------------------------------------------------- // Functor for Upper-triangular solve in CSC -template struct UpperTriTranSupernodalFunctor { using execution_space = typename TriSolveHandle::HandleExecSpace; @@ -2746,9 +2765,9 @@ struct ReturnRangePolicyType { }; #endif -template -void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, +static void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, 
const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2817,9 +2836,9 @@ void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::fence(); } // end lower_tri_solve_cg -template -void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, +static void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2889,9 +2908,9 @@ void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, #endif -template -void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, +static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { @@ -3084,7 +3103,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_diagonal && !invert_offdiagonal) { // copy diagonals to workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( @@ -3175,7 +3194,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( @@ -3188,7 +3207,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, } // launching sparse-triangular solve functor - LowerTriSupernodalFunctor sptrsv_functor(unit_diagonal, invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, @@ -3231,7 +3250,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3243,7 +3262,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3259,7 +3278,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3300,9 +3319,9 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, } // end lower_tri_solve -template -void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, +static void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType 
values, const RHSType &rhs, LHSType &lhs) { @@ -3492,7 +3511,7 @@ tstf); } // end elseif if (invert_diagonal && !invert_offdiagonal) { // copy diagonals to workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( @@ -3586,7 +3605,7 @@ tstf); } // end elseif if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( @@ -3599,7 +3618,7 @@ tstf); } // end elseif } // launching sparse-triangular solve functor - UpperTriTranSupernodalFunctor sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, kernel_type, @@ -3615,7 +3634,7 @@ tstf); } // end elseif sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor - UpperTriSupernodalFunctor sptrsv_functor(invert_diagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, @@ -3704,7 +3723,7 @@ tstf); } // end elseif if (invert_diagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( @@ -3746,7 +3765,7 @@ tstf); } // end elseif auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3758,7 +3777,7 @@ tstf); } // end elseif } else { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3776,7 +3795,7 @@ tstf); } // end elseif if (!invert_offdiagonal) { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3799,7 +3818,7 @@ tstf); } // end elseif } } // reinitialize workspace - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( @@ -3837,9 +3856,9 @@ tstf); } // end elseif } // end upper_tri_solve -template -void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, +static void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs, const bool /*is_lowertri_*/) { @@ -4122,9 +4141,9 @@ void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, // Stream interfaces // -------------------------------- -template -void lower_tri_solve_streams(const std::vector &execspace_v, +static void lower_tri_solve_streams(const std::vector &execspace_v, const 
std::vector &thandle_v, const std::vector &row_map_v, const std::vector &entries_v, @@ -4208,9 +4227,9 @@ void lower_tri_solve_streams(const std::vector &execspace_v, } // end for lvl } // end lower_tri_solve_streams -template -void upper_tri_solve_streams(const std::vector &execspace_v, +static void upper_tri_solve_streams(const std::vector &execspace_v, const std::vector &thandle_v, const std::vector &row_map_v, const std::vector &entries_v, @@ -4294,6 +4313,8 @@ void upper_tri_solve_streams(const std::vector &execspace_v, } // end for lvl } // end upper_tri_solve_streams +}; // struct SptrsvWrap + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 6ad321c286..ecdbdd3d73 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -120,6 +120,8 @@ struct SPTRSV_SOLVE; + // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -127,45 +129,43 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, - entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, true); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, + values, b, x); else #endif - Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, - entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, false); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, + values, b, x); else #endif - Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); } } Kokkos::Profiling::popRegion(); @@ -178,6 +178,7 @@ struct SPTRSV_SOLVE &entries_v, const std::vector &values_v, const std::vector &b_v, std::vector &x_v) { + using Sptrsv = Experimental::SptrsvWrap; // Call specific algorithm type // NOTE: Only 
support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment // Assume streams have the same either lower or upper matrix type @@ -197,9 +198,9 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -208,9 +209,9 @@ struct SPTRSV_SOLVEsptrsvHandle; } void create_sptrsv_handle(KokkosSparse::Experimental::SPTRSVAlgorithm algm, - size_type nrows, bool lower_tri) { + size_type nrows, bool lower_tri, + size_type block_size = 0) { this->destroy_sptrsv_handle(); this->is_owner_of_the_sptrsv_handle = true; - this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri); + this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri, block_size); // this->sptrsvHandle->init_handle(nrows); this->sptrsvHandle->set_team_size(this->team_work_size); this->sptrsvHandle->set_vector_size(this->vector_size); diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index cf23bfdc1f..caa07ab07d 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -56,76 +56,65 @@ template class SPTRSVHandle { public: - typedef ExecutionSpace HandleExecSpace; - typedef TemporaryMemorySpace HandleTempMemorySpace; - typedef PersistentMemorySpace HandlePersistentMemorySpace; - - typedef ExecutionSpace execution_space; - typedef HandlePersistentMemorySpace memory_space; - - typedef typename std::remove_const::type size_type; - typedef const size_type const_size_type; - - typedef typename std::remove_const::type nnz_lno_t; - typedef const nnz_lno_t const_nnz_lno_t; - - typedef typename std::remove_const::type scalar_t; - typedef const scalar_t const_nnz_scalar_t; - - // row_map type (managed memory) - typedef typename Kokkos::View - nnz_row_view_temp_t; - typedef typename Kokkos::View - nnz_row_view_t; - typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t; - typedef typename Kokkos::View - int_row_view_t; - typedef typename Kokkos::View - int64_row_view_t; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; + + using execution_space = ExecutionSpace; + using memory_space = HandlePersistentMemorySpace; + + using TeamPolicy = Kokkos::TeamPolicy; + using RangePolicy = Kokkos::RangePolicy; + + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; + + using nnz_lno_t = typename std::remove_const::type; + using const_nnz_lno_t = const nnz_lno_t; + + using scalar_t = typename std::remove_const::type; + using const_nnz_scalar_t = const scalar_t; + + // Row_map type (managed memory) + using nnz_row_view_temp_t = typename Kokkos::View; + using nnz_row_view_t = typename Kokkos::View; + using host_nnz_row_view_t = typename nnz_row_view_t::HostMirror; + using int_row_view_t = typename Kokkos::View; + using int64_row_view_t = typename Kokkos::View; // typedef typename row_lno_persistent_work_view_t::HostMirror // row_lno_persistent_work_host_view_t; //Host view type - typedef typename Kokkos::View< + using nnz_row_unmanaged_view_t = typename Kokkos::View< const size_type *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_row_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // values type (managed memory) - typedef typename Kokkos::View - nnz_scalar_view_temp_t; - typedef typename Kokkos::View - nnz_scalar_view_t; - typedef typename nnz_scalar_view_t::HostMirror host_nnz_scalar_view_t; - 
typedef typename Kokkos::View< + using nnz_scalar_view_temp_t = typename Kokkos::View; + using nnz_scalar_view_t = typename Kokkos::View; + using host_nnz_scalar_view_t = typename nnz_scalar_view_t::HostMirror; + using nnz_scalar_unmanaged_view_t = typename Kokkos::View< const scalar_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_scalar_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // entries type (managed memory) - typedef typename Kokkos::View - nnz_lno_view_temp_t; - typedef typename Kokkos::View - nnz_lno_view_t; - typedef typename Kokkos::View - hostspace_nnz_lno_view_t; - typedef typename nnz_lno_view_t::HostMirror host_nnz_lno_view_t; - typedef typename Kokkos::View< + using nnz_lno_view_temp_t = typename Kokkos::View; + using nnz_lno_view_t = typename Kokkos::View; + using hostspace_nnz_lno_view_t = typename Kokkos::View; + using host_nnz_lno_view_t = typename nnz_lno_view_t::HostMirror; + using nnz_lno_unmanaged_view_t = typename Kokkos::View< const nnz_lno_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_lno_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // typedef typename nnz_lno_persistent_work_view_t::HostMirror // nnz_lno_persistent_work_host_view_t; //Host view type - typedef typename std::make_signed< - typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; - typedef Kokkos::View - signed_nnz_lno_view_t; - typedef typename signed_nnz_lno_view_t::HostMirror host_signed_nnz_lno_view_t; + using signed_integral_t = typename std::make_signed::type; + using signed_nnz_lno_view_t = Kokkos::View; - typedef typename Kokkos::View - mtx_scalar_view_t; + using host_signed_nnz_lno_view_t = typename signed_nnz_lno_view_t::HostMirror; + + using mtx_scalar_view_t = typename Kokkos::View; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) @@ -214,7 +203,7 @@ class SPTRSVHandle { }; #endif - typedef cuSparseHandleType SPTRSVcuSparseHandleType; + using SPTRSVcuSparseHandleType = cuSparseHandleType; #endif #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT @@ -228,7 +217,7 @@ class SPTRSVHandle { //~cudaGraphWrapperType() { } }; - typedef cudaGraphWrapperType SPTRSVcudaGraphWrapperType; + using SPTRSVcudaGraphWrapperType = cudaGraphWrapperType; void create_SPTRSVcudaGraphWrapperType() { destroy_SPTRSVcudaGraphWrapperType(); @@ -296,6 +285,7 @@ class SPTRSVHandle { nnz_lno_view_t nodes_grouped_by_level; hostspace_nnz_lno_view_t hnodes_grouped_by_level; // NEW size_type nlevel; + size_type block_size; // block_size > 0 implies BSR int team_size; int vector_size; @@ -423,6 +413,7 @@ class SPTRSVHandle { public: SPTRSVHandle(SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, + const size_type block_size_ = 0, bool symbolic_complete_ = false, bool numeric_complete_ = false) : #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT @@ -438,6 +429,7 @@ class SPTRSVHandle { nodes_grouped_by_level(), hnodes_grouped_by_level(), nlevel(0), + block_size(block_size_), team_size(-1), vector_size(-1), stored_diagonal(false), @@ -1007,6 +999,14 @@ class SPTRSVHandle { void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + KOKKOS_INLINE_FUNCTION + void set_block_size(const size_type block_size_) { + this->block_size = block_size_; + } + void set_symbolic_complete() { this->symbolic_complete = true; } void set_symbolic_incomplete() { this->symbolic_complete = false; } 
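Patch 01 extends create_sptrsv_handle and the SPTRSVHandle constructor with an optional block_size argument (block_size > 0 implies a BSR matrix, per the new comment in the handle). A minimal usage sketch of the extended call is given below; the KernelHandle alias, template arguments, and concrete values are illustrative assumptions, not taken from the patch itself.

    #include <Kokkos_Core.hpp>
    #include <KokkosKernels_Handle.hpp>

    // Hypothetical handle alias; substitute the ordinal/scalar/space types
    // used by the application.
    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
        int, int, double, Kokkos::DefaultExecutionSpace,
        Kokkos::DefaultExecutionSpace::memory_space,
        Kokkos::DefaultExecutionSpace::memory_space>;

    KernelHandle kh;
    const int nrows      = 100;  // number of (block) rows, illustrative
    const int block_size = 4;    // > 0 requests the BSR path added by this patch

    // New optional trailing argument; omitting it (default 0) keeps the
    // existing CRS behavior.
    kh.create_sptrsv_handle(
        KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1,
        nrows, /*lower_tri=*/true, block_size);

The stored value can later be queried through the new get_block_size()/set_block_size() accessors on the sptrsv handle.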
From 21e810f5c584dc83be2d1d4b08e1470d37089e25 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 14 Jun 2024 11:13:34 -0600 Subject: [PATCH 02/41] First round of cleanup complete --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 2 - .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 5025 ++++++++--------- 2 files changed, 2366 insertions(+), 2661 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 415ccf87a0..32fa122196 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -52,8 +52,6 @@ struct IlukWrap { using lno_t = typename IlukHandle::nnz_lno_t; using size_type = typename IlukHandle::size_type; using scalar_t = typename IlukHandle::nnz_scalar_t; - using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; - using HandleDeviceValueType = typename IlukHandle::nnz_value_view_t; using WorkViewType = typename IlukHandle::work_view_t; using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; using LevelViewType = typename IlukHandle::nnz_lno_view_t; diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 35188ed52c..1e386f43a4 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -27,15 +27,11 @@ #include #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV - // Enable supernodal sptrsv #include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" - #include "KokkosBatched_Util.hpp" - #include "KokkosBlas2_team_gemv_spec.hpp" - #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif @@ -48,6 +44,11 @@ #include "cuda_profiler_api.h" #endif +#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \ + defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) +#define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT +#endif + namespace KokkosSparse { namespace Impl { namespace Experimental { @@ -58,77 +59,71 @@ struct SptrsvWrap { // // Useful types // - using execution_space = typename TriSolveHandle::execution_space; - using memory_space = typename TriSolveHandle::memory_space; - using lno_t = typename TriSolveHandle::nnz_lno_t; - using size_type = typename TriSolveHandle::size_type; - using scalar_t = typename TriSolveHandle::scalar_t; - using row_map_t = typename TriSolveHandle::nnz_row_view_t; - using entries_t = typename TriSolveHandle::nnz_lno_view_t; - using values_t = typename TriSolveHandle::nnz_scalar_view_t; - using karith = typename Kokkos::ArithTraits; - using team_policy = typename TriSolveHandle::TeamPolicy; - using member_type = typename team_policy::member_type; - using range_policy = typename TriSolveHandle::RangePolicy; - -#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \ - defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) -#define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT -#endif - -struct UnsortedTag {}; - -struct LargerCutoffTag {}; - -struct UnsortedLargerCutoffTag {}; - -template -static void print_view1d_solve(const ViewType dv, size_t range = 0) { - auto v = Kokkos::create_mirror_view(dv); - Kokkos::deep_copy(v, dv); - std::cout << "Output for view " << v.label() << std::endl; - range = range == 0 ? 
dv.extent(0) : range; - for (size_t i = 0; i < range; ++i) { - std::cout << "v(" << i << ") = " << v(i) << " , "; + using execution_space = typename TriSolveHandle::execution_space; + using memory_space = typename TriSolveHandle::memory_space; + using temp_mem_space = typename TriSolveHandle::HandleTempMemorySpace; + using lno_t = typename TriSolveHandle::nnz_lno_t; + using size_type = typename TriSolveHandle::size_type; + using scalar_t = typename TriSolveHandle::scalar_t; + using row_map_t = typename TriSolveHandle::nnz_row_view_t; + using entries_t = typename TriSolveHandle::nnz_lno_view_t; + using values_t = typename TriSolveHandle::nnz_scalar_view_t; + using work_view_t = Kokkos::View>; + using work_view_int_t = Kokkos::View>; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename TriSolveHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename TriSolveHandle::RangePolicy; + using range_type = Kokkos::pair; + + // Tag structs + struct UnsortedTag {}; + struct LargerCutoffTag {}; + struct UnsortedLargerCutoffTag {}; + + template + static void print_view1d_solve(const ViewType dv, size_t range = 0) { + auto v = Kokkos::create_mirror_view(dv); + Kokkos::deep_copy(v, dv); + std::cout << "Output for view " << v.label() << std::endl; + range = range == 0 ? dv.extent(0) : range; + for (size_t i = 0; i < range; ++i) { + std::cout << "v(" << i << ") = " << v(i) << " , "; + } + std::cout << std::endl; } - std::cout << std::endl; -} - -// Needed for cudagraphs -struct EmptyFunctor { - KOKKOS_INLINE_FUNCTION - void operator()(const int) const {} -}; - -// This functor unifies the lower and upper implementations, the hope is the -// "is_lowertri" check does not add noticable time on larger problems -template -struct TriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - - TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const bool &is_lowertri_, const long &node_count_) + + // Needed for cudagraphs + struct EmptyFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int) const {} + }; + + // This functor unifies the lower and upper implementations, the hope is the + // "is_lowertri" check does not add noticable time on larger problems + template + struct TriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + + TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const bool &is_lowertri_, const long &node_count_) : row_map(row_map_), entries(entries_), values(values_), @@ -138,18 +133,18 @@ struct 
TriLvlSchedTP1SolverFunctor { is_lowertri(is_lowertri_), node_count(node_count_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -161,31 +156,31 @@ struct TriLvlSchedTP1SolverFunctor { }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); + } } - } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - auto diag = -1; + auto diag = -1; - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -197,47 +192,41 @@ struct TriLvlSchedTP1SolverFunctor { } }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } } - } -}; - -template -struct TriLvlSchedTP1SolverFunctorDiagValues { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - 
ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - ValuesType diagonal_values; // inserted according to rowid - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long dense_nrows; - - TriLvlSchedTP1SolverFunctorDiagValues(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, - const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) + }; + + template + struct TriLvlSchedTP1SolverFunctorDiagValues { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + ValuesType diagonal_values; // inserted according to rowid + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long dense_nrows; + + TriLvlSchedTP1SolverFunctorDiagValues(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, + LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const ValuesType &diagonal_values_, + const bool is_lowertri_, + long node_count_, long dense_nrows_ = 0) : row_map(row_map_), entries(entries_), values(values_), @@ -249,18 +238,18 @@ struct TriLvlSchedTP1SolverFunctorDiagValues { node_count(node_count_), dense_nrows(dense_nrows_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -271,47 +260,41 @@ struct TriLvlSchedTP1SolverFunctorDiagValues { }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : - // (rhs_rowid+diff)/values(soffset); - lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // lhs(rowid) = is_lowertri ? 
(rhs_rowid+diff)/values(eoffset-1) : + // (rhs_rowid+diff)/values(soffset); + lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + } } - } -}; - -template -struct TriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - const bool is_lowertri; - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - long dense_nrows; - - TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long node_groups_ = 0, long dense_nrows_ = 0) + }; + + template + struct TriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + const bool is_lowertri; + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + long dense_nrows; + + TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const bool is_lowertri_, long node_count_, + long node_groups_ = 0, long dense_nrows_ = 0) : row_map(row_map_), entries(entries_), values(values_), @@ -323,13 +306,13 @@ struct TriLvlSchedTP2SolverFunctor { node_groups(node_groups_), dense_nrows(dense_nrows_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + size_t nrows = row_map.extent(0) - 1; - Kokkos::parallel_for( + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { auto rowid = nodes_grouped_by_level(node_count + my_league * node_groups + ng); @@ -356,16 +339,16 @@ struct TriLvlSchedTP2SolverFunctor { } // end if }); // end TeamThreadRange - team.team_barrier(); - } + team.team_barrier(); + } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + size_t nrows = row_map.extent(0) - 1; - Kokkos::parallel_for( + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { auto rowid = nodes_grouped_by_level(node_count + my_league * node_groups + ng); @@ -393,28 +376,27 @@ struct TriLvlSchedTP2SolverFunctor { } // end if }); // end TeamThreadRange - team.team_barrier(); - } -}; - -// Lower vs Upper Multi-block Functors - -template -struct LowerTriLvlSchedRPSolverFunctor { - typedef typename EntriesType::non_const_value_type lno_t; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType 
nodes_grouped_by_level; - - LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_) + team.team_barrier(); + } + }; + + // Lower vs Upper Multi-block Functors + + template + struct LowerTriLvlSchedRPSolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) : row_map(row_map_), entries(entries_), values(values_), @@ -422,74 +404,68 @@ struct LowerTriLvlSchedRPSolverFunctor { rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + // Assuming indices are sorted per row, diag entry is final index in the + // list + + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + lhs(rowid) = rhs_rowid / val; + } + } // end for ptr + } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } -}; - -template -struct LowerTriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + 
auto diag = -1; + + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + diag = ptr; + } + } // end for ptr + lhs(rowid) = rhs_rowid / values(diag); + } + }; + + template + struct LowerTriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) : row_map(row_map_), entries(entries_), values(values_), @@ -499,18 +475,18 @@ struct LowerTriLvlSchedTP1SolverFunctor { node_count(node_count_), node_groups(node_groups_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -521,30 +497,30 @@ struct LowerTriLvlSchedTP1SolverFunctor { }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + } } - } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - auto diag = -1; + auto diag = -1; - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -556,45 +532,39 @@ struct 
LowerTriLvlSchedTP1SolverFunctor { } }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } } - } -}; - -// FIXME CUDA: This algorithm not working with all integral type combos -// In any case, this serves as a skeleton for 3-level hierarchical parallelism -// for alg dev -template -struct LowerTriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) + }; + + // FIXME CUDA: This algorithm not working with all integral type combos + // In any case, this serves as a skeleton for 3-level hierarchical parallelism + // for alg dev + template + struct LowerTriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) : row_map(row_map_), entries(entries_), values(values_), @@ -604,13 +574,13 @@ struct LowerTriLvlSchedTP2SolverFunctor { node_count(node_count_), node_groups(node_groups_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + size_t nrows = row_map.extent(0) - 1; - Kokkos::parallel_for( + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { auto rowid = nodes_grouped_by_level(node_count + my_league * node_groups + ng); @@ -636,16 +606,16 @@ struct LowerTriLvlSchedTP2SolverFunctor { } // end if }); // end TeamThreadRange - team.team_barrier(); - } + team.team_barrier(); + } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + size_t nrows = row_map.extent(0) - 1; - Kokkos::parallel_for( + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) 
{ auto rowid = nodes_grouped_by_level(node_count + my_league * node_groups + ng); @@ -674,42 +644,30 @@ struct LowerTriLvlSchedTP2SolverFunctor { } // end if }); // end TeamThreadRange - team.team_barrier(); - } -}; + team.team_barrier(); + } + }; #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// ----------------------------------------------------------- -// Helper functors for Lower-triangular solve with SpMV -template -struct SparseTriSupernodalSpMVFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename LHSType::non_const_value_type; - - using work_view_t = - typename Kokkos::View>; - - int flag; - long node_count; - NGBLType nodes_grouped_by_level; - - const int *supercols; - const int *workoffset; - - LHSType X; - work_view_t work; - - // constructor - SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, - const NGBLType &nodes_grouped_by_level_, - const int *supercols_, const int *workoffset_, - LHSType &X_, work_view_t work_) + // ----------------------------------------------------------- + // Helper functors for Lower-triangular solve with SpMV + template + struct SparseTriSupernodalSpMVFunctor { + int flag; + long node_count; + entries_t nodes_grouped_by_level; + + const int *supercols; + const int *workoffset; + + LHSType X; + work_view_t work; + + // constructor + SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, + const entries_t &nodes_grouped_by_level_, + const int *supercols_, const int *workoffset_, + LHSType &X_, work_view_t work_) : flag(flag_), node_count(node_count_), nodes_grouped_by_level(nodes_grouped_by_level_), @@ -718,105 +676,90 @@ struct SparseTriSupernodalSpMVFunctor { X(X_), work(work_) {} - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // copy vector elements for the diagonal to input vector (work) - // and zero out the corresponding elements in output (X) - int w1 = workoffset[s]; - int j1 = supercols[s]; - // number of columns in the s-th supernode column - int nscol = supercols[s + 1] - j1; - - if (flag == -2) { - // copy X to work - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = X(j1 + j); - } - } else if (flag == -1) { - // copy work to X - for (int j = team_rank; j < nscol; j += team_size) { - X(j1 + j) = work(w1 + j); - } - } else if (flag == 1) { - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = X(j1 + j); - X(j1 + j) = zero; - } - } else { - // reinitialize work to zero - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = zero; + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // copy vector elements for the diagonal to input vector (work) + // and zero out the corresponding elements in output (X) + int w1 = workoffset[s]; + int j1 = supercols[s]; + // number of columns in the s-th supernode column 
+ int nscol = supercols[s + 1] - j1; + + if (flag == -2) { + // copy X to work + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = X(j1 + j); + } + } else if (flag == -1) { + // copy work to X + for (int j = team_rank; j < nscol; j += team_size) { + X(j1 + j) = work(w1 + j); + } + } else if (flag == 1) { + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = X(j1 + j); + X(j1 + j) = zero; + } + } else { + // reinitialize work to zero + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = zero; + } } + team.team_barrier(); } - team.team_barrier(); - } -}; - -// ----------------------------------------------------------- -// Functor for Lower-triangular solve -template -struct LowerTriSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; - - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - using range_type = Kokkos::pair; + }; - const bool unit_diagonal; - const bool invert_diagonal; - const bool invert_offdiagonal; - const int *supercols; - ColptrView colptr; - RowindType rowind; - ValuesType values; + // ----------------------------------------------------------- + // Functor for Lower-triangular solve + template + struct LowerTriSupernodalFunctor { + const bool unit_diagonal; + const bool invert_diagonal; + const bool invert_offdiagonal; + const int *supercols; + ColptrView colptr; + RowindType rowind; + ValuesType values; - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; - LHSType X; + LHSType X; - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; - NGBLType nodes_grouped_by_level; + entries_t nodes_grouped_by_level; - long node_count; + long node_count; - // constructor - LowerTriSupernodalFunctor( // supernode info + // constructor + LowerTriSupernodalFunctor( // supernode info const bool unit_diagonal_, const bool invert_diagonal_, const bool invert_offdiagonal_, const int *supercols_, // L in CSC const ColptrView &colptr_, const RowindType &rowind_, const ValuesType &values_, // options to pick kernel type - int level_, integer_view_t &kernel_type_, - integer_view_t &diag_kernel_type_, + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, // right-hand-side (input), solution (output) LHSType &X_, // workspace - work_view_t work_, integer_view_t &work_offset_, + work_view_t work_, work_view_int_t &work_offset_, // - const NGBLType &nodes_grouped_by_level_, long node_count_) + const entries_t &nodes_grouped_by_level_, long node_count_) : unit_diagonal(unit_diagonal_), invert_diagonal(invert_diagonal_), invert_offdiagonal(invert_offdiagonal_), @@ -833,199 +776,184 @@ struct LowerTriSupernodalFunctor { nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_) {} - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = 
team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // supernodal column size - const int j1 = supercols[s]; - const int j2 = supercols[s + 1]; - // > number of columns in the s-th supernode column - const int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - const int i1 = colptr(j1); - const int nsrow = colptr(j1 + 1) - i1; - - // create a view for the s-th supernocal column - // NOTE: we currently supports only default_layout = LayoutLeft - scalar_t *dataL = const_cast(values.data()); - Kokkos::View + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- */ + /* get inputs */ + /* ---------------------------------------------------------------------- */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // supernodal column size + const int j1 = supercols[s]; + const int j2 = supercols[s + 1]; + // > number of columns in the s-th supernode column + const int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + const int i1 = colptr(j1); + const int nsrow = colptr(j1 + 1) - i1; + + // create a view for the s-th supernocal column + // NOTE: we currently supports only default_layout = LayoutLeft + scalar_t *dataL = const_cast(values.data()); + Kokkos::View viewL(&dataL[i1], nsrow, nscol); - // extract part of the solution, corresponding to the diagonal block - auto Xj = Kokkos::subview(X, range_type(j1, j2)); + // extract part of the solution, corresponding to the diagonal block + auto Xj = Kokkos::subview(X, range_type(j1, j2)); - // workspace - const int workoffset = work_offset(s); - auto Z = Kokkos::subview( + // workspace + const int workoffset = work_offset(s); + auto Z = Kokkos::subview( work, range_type(workoffset + nscol, workoffset + nsrow)); - if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve - if (invert_offdiagonal) { - // combined TRSM solve with diagonal + GEMV update with off-diagonal - auto Y = Kokkos::subview( + if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve + if (invert_offdiagonal) { + // combined TRSM solve with diagonal + GEMV update with off-diagonal + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + nsrow)); // needed for gemv instead of trmv/trsv - auto Ljj = Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Ljj, Xj, - zero, - Y); - team.team_barrier(); - for (int ii = team_rank; ii < nscol; ii += team_size) { - Xj(ii) = Y(ii); - } - team.team_barrier(); - } else { - /* TRSM with diagonal block */ - // extract diagonal and off-diagonal blocks of L - auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - // workspace - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); - } - team.team_barrier(); - // calling team-level "Unblocked" gemv on small-size diagonal in - // 
KokkosBatched + auto Ljj = Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); KokkosBlas::TeamGemv::invoke(team, one, - Ljj, - Y, + Ljj, Xj, zero, - Xj); + Y); + team.team_barrier(); + for (int ii = team_rank; ii < nscol; ii += team_size) { + Xj(ii) = Y(ii); + } + team.team_barrier(); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View + /* TRSM with diagonal block */ + // extract diagonal and off-diagonal blocks of L + auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + // workspace + auto Y = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + nscol)); // needed for gemv instead of trmv/trsv + for (int ii = team_rank; ii < nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); + // calling team-level "Unblocked" gemv on small-size diagonal in + // KokkosBatched + KokkosBlas::TeamGemv::invoke(team, + one, + Ljj, + Y, + zero, + Xj); + } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View Xjj(Xj.data(), nscol, 1); - if (unit_diagonal) { - KokkosBatched::TeamTrsm< + if (unit_diagonal) { + KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, Xjj); - } else { - KokkosBatched::TeamTrsm< + } else { + KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, Xjj); + } } - } - team.team_barrier(); + team.team_barrier(); - /* GEMM to update with off diagonal blocks */ - auto Lij = + /* GEMM to update with off diagonal blocks */ + auto Lij = Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Lij, Xj, - zero, - Z); - team.team_barrier(); + KokkosBlas::TeamGemv::invoke(team, + one, + Lij, Xj, + zero, + Z); + team.team_barrier(); + } } - } - /* scatter vectors back into X */ - int i2 = i1 + nscol; // offset into rowind - int nsrow2 = + /* scatter vectors back into X */ + int i2 = i1 + nscol; // offset into rowind + int nsrow2 = nsrow - nscol; // "total" number of rows in all the off-diagonal supernodes - Kokkos::View> + Kokkos::View> Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); + } + team.team_barrier(); } - team.team_barrier(); - } -}; - -// ----------------------------------------------------------- -// Functor for Upper-triangular solve in CSR -template -struct UpperTriSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; + }; - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - // NOTE: we currently supports only default_layout = LayoutLeft - using SupernodeView = - typename Kokkos::View + struct UpperTriSupernodalFunctor { + // NOTE: we currently supports only default_layout = LayoutLeft + using SupernodeView = + typename Kokkos::View; - using range_type = 
Kokkos::pair; + bool invert_diagonal; + const int *supercols; + ColptrType colptr; + RowindType rowind; + ValuesType values; - bool invert_diagonal; - const int *supercols; - ColptrType colptr; - RowindType rowind; - ValuesType values; + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; + LHSType X; - LHSType X; + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; + entries_t nodes_grouped_by_level; - NGBLType nodes_grouped_by_level; + long node_count; - long node_count; - - // constructor - UpperTriSupernodalFunctor( // supernode info + // constructor + UpperTriSupernodalFunctor( // supernode info bool invert_diagonal_, const int *supercols_, // U in CSR const ColptrType &colptr_, const RowindType &rowind_, const ValuesType &values_, // options to pick kernel type - int level_, integer_view_t &kernel_type_, - integer_view_t &diag_kernel_type_, + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, // right-hand-side (input), solution (output) LHSType &X_, // workspace - work_view_t &work_, integer_view_t &work_offset_, + work_view_t &work_, work_view_int_t &work_offset_, // - const NGBLType &nodes_grouped_by_level_, long node_count_) + const entries_t &nodes_grouped_by_level_, long node_count_) : invert_diagonal(invert_diagonal_), supercols(supercols_), colptr(colptr_), @@ -1040,145 +968,130 @@ struct UpperTriSupernodalFunctor { nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_) {} - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // number of columns in the s-th supernode column - int j1 = supercols[s]; - int j2 = supercols[s + 1]; - int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - int i1 = colptr(j1); - int nsrow = colptr(j1 + 1) - i1; - - // create a view of the s-th supernocal row of U - scalar_t *dataU = const_cast(values.data()); - SupernodeView viewU(&dataU[i1], nsrow, nscol); - - // extract part of solution, corresponding to the diagonal block U(s, s) - auto Xj = Kokkos::subview(X, range_type(j1, j2)); - using Xj_type = decltype(Xj); - - // workspaces - int workoffset = work_offset(s); - - // "total" number of rows in all the off-diagonal supernodes - int nsrow2 = nsrow - nscol; - /* gather vector into Z */ - int i2 = i1 + nscol; // offset into rowind - auto Z = Kokkos::subview( + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- */ + /* get inputs */ + /* ---------------------------------------------------------------------- */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + 
league_rank); + + // number of columns in the s-th supernode column + int j1 = supercols[s]; + int j2 = supercols[s + 1]; + int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + int i1 = colptr(j1); + int nsrow = colptr(j1 + 1) - i1; + + // create a view of the s-th supernocal row of U + scalar_t *dataU = const_cast(values.data()); + SupernodeView viewU(&dataU[i1], nsrow, nscol); + + // extract part of solution, corresponding to the diagonal block U(s, s) + auto Xj = Kokkos::subview(X, range_type(j1, j2)); + using Xj_type = decltype(Xj); + + // workspaces + int workoffset = work_offset(s); + + // "total" number of rows in all the off-diagonal supernodes + int nsrow2 = nsrow - nscol; + /* gather vector into Z */ + int i2 = i1 + nscol; // offset into rowind + auto Z = Kokkos::subview( work, range_type(workoffset + nscol, workoffset + nsrow)); // needed with gemv for update&scatter - using Z_type = decltype(Z); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Z(ii) = X(i); - } - team.team_barrier(); - /* GEMM to update with off diagonal blocks, Xj = -Uij^T * Z */ - if (diag_kernel_type(level) != 3) { - // not device-level GEMV-udpate - auto Uij = + using Z_type = decltype(Z); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Z(ii) = X(i); + } + team.team_barrier(); + /* GEMM to update with off diagonal blocks, Xj = -Uij^T * Z */ + if (diag_kernel_type(level) != 3) { + // not device-level GEMV-udpate + auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - using Uij_type = decltype(Uij); - KokkosBlas::TeamGemv:: + using Uij_type = decltype(Uij); + KokkosBlas::TeamGemv:: template invoke( team, -one, Uij, Z, one, Xj); - team.team_barrier(); + team.team_barrier(); - /* TRSM with diagonal block */ - // extract diagonal and off-diagonal blocks of U - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - using Ujj_type = decltype(Ujj); + /* TRSM with diagonal block */ + // extract diagonal and off-diagonal blocks of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + using Ujj_type = decltype(Ujj); - if (invert_diagonal) { - // workspace - auto Y = Kokkos::subview( + if (invert_diagonal) { + // workspace + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - using Y_type = decltype(Y); - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); - } - team.team_barrier(); + using Y_type = decltype(Y); + for (int ii = team_rank; ii < nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); - // caling team-level kernel in KokkosBatched on a small-size diagonal - KokkosBlas::TeamGemv:: + // caling team-level kernel in KokkosBatched on a small-size diagonal + KokkosBlas::TeamGemv:: template invoke( team, one, Ujj, Y, zero, Xj); - } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View + } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBatched::TeamTrsm< + KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); + } + team.team_barrier(); } - team.team_barrier(); } - } -}; - -// ----------------------------------------------------------- -// Functor for 
Upper-triangular solve in CSC -template -struct UpperTriTranSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; + }; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + // ----------------------------------------------------------- + // Functor for Upper-triangular solve in CSC + template + struct UpperTriTranSupernodalFunctor { + const bool invert_diagonal; + const bool invert_offdiagonal; + const int *supercols; + ColptrType colptr; + RowindType rowind; + ValuesType values; - using scalar_t = typename ValuesType::non_const_value_type; + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; + LHSType X; - using range_type = Kokkos::pair; + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; - const bool invert_diagonal; - const bool invert_offdiagonal; - const int *supercols; - ColptrType colptr; - RowindType rowind; - ValuesType values; + entries_t nodes_grouped_by_level; - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; + long node_count; - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - UpperTriTranSupernodalFunctor( // supernode info + // constructor + UpperTriTranSupernodalFunctor( // supernode info const bool invert_diagonal_, const bool invert_offdiagonal_, const int *supercols_, @@ -1186,14 +1099,14 @@ struct UpperTriTranSupernodalFunctor { const ColptrType &colptr_, const RowindType &rowind_, const ValuesType &values_, // options to pick kernel type - const int level_, const integer_view_t &kernel_type_, - const integer_view_t &diag_kernel_type_, + const int level_, const work_view_int_t &kernel_type_, + const work_view_int_t &diag_kernel_type_, // right-hand-side (input), solution (output) const LHSType &X_, // workspace - const work_view_t &work_, const integer_view_t &work_offset_, + const work_view_t &work_, const work_view_int_t &work_offset_, // - const NGBLType &nodes_grouped_by_level_, const long node_count_) + const entries_t &nodes_grouped_by_level_, const long node_count_) : invert_diagonal(invert_diagonal_), invert_offdiagonal(invert_offdiagonal_), supercols(supercols_), @@ -1209,148 +1122,147 @@ struct UpperTriTranSupernodalFunctor { nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_) {} - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // number of columns in the s-th supernode column - const int j1 = supercols[s]; - const int j2 = supercols[s + 1]; - const int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - const int i1 = colptr(j1); - const int nsrow = colptr(j1 + 1) - i1; - // "total" number of rows in all the off-diagonal supernodes - const int nsrow2 = nsrow - nscol; - 
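For reference, every supernodal functor in this hunk treats a supernode as a dense, column-major (LayoutLeft) nsrow x nscol panel: the top nscol rows hold the triangular diagonal block and the remaining nsrow - nscol rows hold the off-diagonal block whose product is scattered back into X. A minimal serial sketch of the lower-triangular case follows; plain forward substitution stands in for the TeamTrsm / inverted-diagonal TeamGemv variants, and the names (panel, rowind, x) are illustrative only, with rowind assumed to hold the global row index of each off-diagonal panel row.

#include <vector>

// Serial sketch of one supernode step of a lower-triangular solve.
// panel: nsrow x nscol, column-major; rows [0, nscol) form the triangular
// diagonal block, rows [nscol, nsrow) form the off-diagonal block.
// j1 is the first global column of the supernode; x is the solution vector.
void lower_supernode_step(const std::vector<double>& panel, int nsrow,
                          int nscol, const std::vector<int>& rowind, int j1,
                          std::vector<double>& x) {
  // "TRSM" with the diagonal block: column-oriented forward substitution.
  for (int j = 0; j < nscol; ++j) {
    x[j1 + j] /= panel[j + j * nsrow];  // L(j, j)
    for (int i = j + 1; i < nscol; ++i) {
      x[j1 + i] -= panel[i + j * nsrow] * x[j1 + j];
    }
  }
  // "GEMV" with the off-diagonal block, then scatter-subtract into x.
  // The functors use an atomic view here because supernodes solved in the
  // same level may update the same rows.
  for (int ii = 0; ii < nsrow - nscol; ++ii) {
    double z = 0.0;
    for (int j = 0; j < nscol; ++j) {
      z += panel[(nscol + ii) + j * nsrow] * x[j1 + j];
    }
    x[rowind[ii]] -= z;
  }
}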
- // create a view of the s-th supernocal column of U - // NOTE: we currently supports only default_layout = LayoutLeft - scalar_t *dataU = const_cast(values.data()); - Kokkos::View + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- */ + /* get inputs */ + /* ---------------------------------------------------------------------- */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // number of columns in the s-th supernode column + const int j1 = supercols[s]; + const int j2 = supercols[s + 1]; + const int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + const int i1 = colptr(j1); + const int nsrow = colptr(j1 + 1) - i1; + // "total" number of rows in all the off-diagonal supernodes + const int nsrow2 = nsrow - nscol; + + // create a view of the s-th supernocal column of U + // NOTE: we currently supports only default_layout = LayoutLeft + scalar_t *dataU = const_cast(values.data()); + Kokkos::View viewU(&dataU[i1], nsrow, nscol); - // extract part of solution, corresponding to the diagonal block U(s, s) - auto Xj = Kokkos::subview(X, range_type(j1, j2)); + // extract part of solution, corresponding to the diagonal block U(s, s) + auto Xj = Kokkos::subview(X, range_type(j1, j2)); - // workspaces - int workoffset = work_offset(s); - - /* TRSM with diagonal block */ - if (diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - if (invert_offdiagonal) { - // extract diagonal + off-diagonal blocks of U - auto Y = Kokkos::subview( + // workspaces + int workoffset = work_offset(s); + + /* TRSM with diagonal block */ + if (diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + if (invert_offdiagonal) { + // extract diagonal + off-diagonal blocks of U + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, Xj, - zero, - Y); - team.team_barrier(); - // copy the diagonal back to output - for (int ii = team_rank; ii < nscol; ii += team_size) { - Xj(ii) = Y(ii); - } - } else { - // extract diagonal block of U (stored on top) - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); - } - team.team_barrier(); + auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); KokkosBlas::TeamGemv::invoke(team, one, - Ujj, - Y, + Uij, Xj, zero, - Xj); + Y); + team.team_barrier(); + // copy the diagonal back to output + for (int ii = team_rank; ii < nscol; ii += team_size) { + Xj(ii) = Y(ii); + } } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View + // extract diagonal block of U (stored on top) + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + auto Y = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + nscol)); // needed for gemv instead of trmv/trsv + for (int ii = team_rank; ii 
< nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); + KokkosBlas::TeamGemv::invoke(team, + one, + Ujj, + Y, + zero, + Xj); + } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBatched::TeamTrsm< + KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); + } } + team.team_barrier(); } - team.team_barrier(); - } - if (nsrow2 > 0) { - /* GEMM to update off diagonal blocks, Z = Uij * Xj */ - auto Z = Kokkos::subview( + if (nsrow2 > 0) { + /* GEMM to update off diagonal blocks, Z = Uij * Xj */ + auto Z = Kokkos::subview( work, range_type(workoffset + nscol, workoffset + nsrow)); - if (!invert_offdiagonal && diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - auto Uij = + if (!invert_offdiagonal && diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, Xj, - zero, - Z); - team.team_barrier(); - } - - /* scatter vector into Z */ - int i2 = i1 + nscol; // offset into rowind - Kokkos::View> + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, Xj, + zero, + Z); + team.team_barrier(); + } + + /* scatter vector into Z */ + int i2 = i1 + nscol; // offset into rowind + Kokkos::View> Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); + } + team.team_barrier(); } - team.team_barrier(); } - } -}; + }; #endif -template -struct UpperTriLvlSchedRPSolverFunctor { - typedef typename EntriesType::non_const_value_type lno_t; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_) + template + struct UpperTriLvlSchedRPSolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) : row_map(row_map_), entries(entries_), values(values_), @@ -1358,71 +1270,65 @@ struct UpperTriLvlSchedRPSolverFunctor { rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + // Assuming indices are sorted per row, diag entry 
is final index in the + // list + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + lhs(rowid) = rhs_rowid / val; + } + } // end for ptr + } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } -}; - -template -struct UpperTriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + auto diag = -1; + for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + diag = ptr; + } + } // end for ptr + lhs(rowid) = rhs_rowid / values(diag); + } + }; + + template + struct UpperTriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) : row_map(row_map_), entries(entries_), values(values_), @@ -1432,18 +1338,18 @@ struct UpperTriLvlSchedTP1SolverFunctor { node_count(node_count_), node_groups(node_groups_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - 
scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -1454,30 +1360,30 @@ struct UpperTriLvlSchedTP1SolverFunctor { }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); + // At end, finalize rowid == colid + // only one thread should do this, also can use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at start offset + lhs(rowid) = (rhs_rowid + diff) / values(soffset); + } } - } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - auto diag = -1; + auto diag = -1; - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); @@ -1489,45 +1395,39 @@ struct UpperTriLvlSchedTP1SolverFunctor { } }, diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); + // At end, finalize rowid == colid + // only one thread should do this, also can use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } } - } -}; - -// FIXME CUDA: This algorithm not working with all integral type combos -// In any case, this serves as a skeleton for 3-level hierarchical parallelism -// for alg dev -template -struct UpperTriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) + }; + + // FIXME CUDA: This algorithm not working with all integral type combos + // In any case, this serves as a skeleton for 3-level 
hierarchical parallelism + // for alg dev + template + struct UpperTriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) : row_map(row_map_), entries(entries_), values(values_), @@ -1537,13 +1437,13 @@ struct UpperTriLvlSchedTP2SolverFunctor { node_count(node_count_), node_groups(node_groups_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + size_t nrows = row_map.extent(0) - 1; - Kokkos::parallel_for( + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { auto rowid = nodes_grouped_by_level(node_count + my_league * node_groups + ng); @@ -1569,16 +1469,16 @@ struct UpperTriLvlSchedTP2SolverFunctor { } // end if }); // end TeamThreadRange - team.team_barrier(); - } + team.team_barrier(); + } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + size_t nrows = row_map.extent(0) - 1; - Kokkos::parallel_for( + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { auto rowid = nodes_grouped_by_level(node_count + my_league * node_groups + ng); @@ -1606,42 +1506,36 @@ struct UpperTriLvlSchedTP2SolverFunctor { } // end if }); // end TeamThreadRange - team.team_barrier(); - } -}; - -// -------------------------------- -// Single-block functors -// -------------------------------- - -template -struct LowerTriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
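All of the level-scheduled functors in this file share one invariant: rows are grouped into levels so that every row in a level depends only on rows solved in earlier levels, which makes each level embarrassingly parallel. The serial reference below captures that structure for the lower-triangular CRS case under the same sorted-diagonal-last assumption the functors make; the function and variable names are illustrative, not part of the kernels themselves.

#include <cstddef>
#include <vector>

// Serial reference for a level-scheduled lower-triangular solve on CRS data.
// Assumes the diagonal entry is stored last in each row (the "sorted" case).
// nodes_grouped_by_level: row ids concatenated level by level.
// nodes_per_level: number of rows in each level.
void lower_tri_solve_levelset(const std::vector<int>& row_map,
                              const std::vector<int>& entries,
                              const std::vector<double>& values,
                              const std::vector<int>& nodes_grouped_by_level,
                              const std::vector<int>& nodes_per_level,
                              const std::vector<double>& rhs,
                              std::vector<double>& lhs) {
  int node_count = 0;  // running offset into nodes_grouped_by_level
  for (std::size_t lvl = 0; lvl < nodes_per_level.size(); ++lvl) {
    // Every iteration of this inner loop is independent; the functors map it
    // onto a RangePolicy, one team per row, or a single team over all rows.
    for (int k = 0; k < nodes_per_level[lvl]; ++k) {
      const int rowid   = nodes_grouped_by_level[node_count + k];
      const int soffset = row_map[rowid];
      const int eoffset = row_map[rowid + 1];
      double diff = rhs[rowid];
      for (int ptr = soffset; ptr < eoffset - 1; ++ptr) {
        diff -= values[ptr] * lhs[entries[ptr]];
      }
      lhs[rowid] = diff / values[eoffset - 1];  // diagonal stored last
    }
    node_count += nodes_per_level[lvl];
  }
}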
+ team.team_barrier(); + } + }; + + // -------------------------------- + // Single-block functors + // -------------------------------- + + template + struct LowerTriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... LowerTriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) : row_map(row_map_), entries(entries_), @@ -1655,41 +1549,41 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { lvl_end(lvl_end_), cutoff(cutoff_) {} - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -1701,51 +1595,51 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at 
eoffset - 1 - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - auto diag = -1; + auto trange = eoffset - soffset; + auto diag = -1; - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -1760,51 +1654,51 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - 
typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -1816,58 +1710,58 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - 
for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - auto diag = -1; + auto trange = eoffset - soffset; + auto diag = -1; - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -1881,47 +1775,41 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct UpperTriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - 
ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - UpperTriLvlSchedTP1SingleBlockFunctor( + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct UpperTriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + UpperTriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) : row_map(row_map_), entries(entries_), @@ -1935,41 +1823,40 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { lvl_end(lvl_end_), cutoff(cutoff_) {} - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= 
val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -1981,55 +1868,55 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at soffset - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at soffset + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // each thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } else { + diag = ptr; + } } - } #else - auto trange = eoffset - soffset; - auto diag = -1; + auto trange = eoffset - soffset; + auto diag = -1; - Kokkos::parallel_reduce( + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2043,52 +1930,52 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - lhs(rowid) = (rhs_val 
+ diff) / values(diag); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // each thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
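The LargerCutoff operators differ from the plain single-block path only in how threads map to rows: when cutoff exceeds the team size, each thread strides over several row slots per level instead of taking at most one. A hedged serial sketch of that mapping for one level is shown below, including the unsorted-diagonal handling where the diagonal position is discovered during the sweep rather than assumed at soffset or eoffset - 1; all names are illustrative.

#include <vector>

// Sketch of the strided thread-to-row mapping used by the LargerCutoff
// operators, for a single level and a single (team_rank, team_size) pair.
void solve_level_larger_cutoff(int team_rank, int team_size, int cutoff,
                               long node_count, int nodes_this_lvl,
                               const std::vector<int>& nodes_grouped_by_level,
                               const std::vector<int>& row_map,
                               const std::vector<int>& entries,
                               const std::vector<double>& values,
                               const std::vector<double>& rhs,
                               std::vector<double>& lhs) {
  // Each thread covers slots team_rank, team_rank + team_size, ... < cutoff.
  for (int my_rank = team_rank; my_rank < cutoff; my_rank += team_size) {
    if (my_rank >= nodes_this_lvl) continue;  // slot beyond this level's rows
    const int rowid = nodes_grouped_by_level[node_count + my_rank];
    double diff = 0.0;
    int diag = -1;
    for (int ptr = row_map[rowid]; ptr < row_map[rowid + 1]; ++ptr) {
      const int colid = entries[ptr];
      if (colid != rowid) {
        diff -= values[ptr] * lhs[colid];
      } else {
        diag = ptr;  // unsorted case: remember where the diagonal lives
      }
    }
    lhs[rowid] = (rhs[rowid] + diff) / values[diag];
  }
}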
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2100,60 +1987,60 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } else { + diag = ptr; + } } - } #else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2167,49 +2054,43 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct TriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long 
lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctor( + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct TriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + TriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) : row_map(row_map_), @@ -2226,41 +2107,41 @@ struct TriLvlSchedTP1SingleBlockFunctor { dense_nrows(dense_nrows_), cutoff(cutoff_) {} - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + 
auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2273,58 +2154,58 @@ struct TriLvlSchedTP1SingleBlockFunctor { diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri + if (is_lowertri) + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + else + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } else { + diag = ptr; + } } - } #else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + auto diag = -1; + 
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2338,52 +2219,52 @@ struct TriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
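+        // Sorted path: the diagonal is read from a fixed position,
+        // values(eoffset - 1) when is_lowertri is true and values(soffset)
+        // otherwise. A minimal lower-triangular example (hypothetical 2x2
+        // system, not from the handle):
+        //   [2 0][x0]   [4]        x0 = 4 / 2 = 2
+        //   [1 3][x1] = [9]  gives x1 = (9 - 1*x0) / 3 = 7/3,
+        // where the "- 1*x0" term is what diff accumulates below.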
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } } - } #else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2396,63 +2277,63 @@ struct TriLvlSchedTP1SingleBlockFunctor { diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
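+        // nodes_grouped_by_level lists the rows level by level, and
+        // mut_node_count is the running offset of the current level, so
+        // my_rank + mut_node_count selects this thread's row. Rows within a
+        // level depend only on rows from earlier levels, which is why they
+        // can be solved concurrently before the team_barrier() moves the
+        // team on to the next level.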
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri + if (is_lowertri) + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + else + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } else { + diag = ptr; + } } - } #else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2466,50 +2347,44 @@ struct TriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct TriLvlSchedTP1SingleBlockFunctorDiagValues { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - ValuesType 
diagonal_values; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctorDiagValues( + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct TriLvlSchedTP1SingleBlockFunctorDiagValues { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + ValuesType diagonal_values; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + TriLvlSchedTP1SingleBlockFunctorDiagValues( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, const NGBLType &nodes_per_level_, + const entries_t &nodes_grouped_by_level_, const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, long node_count_, const long lvl_start_, const long lvl_end_, const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) @@ -2528,83 +2403,23 @@ struct TriLvlSchedTP1SingleBlockFunctorDiagValues { dense_nrows(dense_nrows_), cutoff(cutoff_) {} - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t 
diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2623,6 +2438,66 @@ struct TriLvlSchedTP1SingleBlockFunctorDiagValues { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
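+        // Unlike the plain single-block functor, this variant divides by
+        // diagonal_values(rowid), a separate view of per-row diagonal values
+        // supplied at construction, so the sweep below never has to locate
+        // the diagonal inside values().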
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { auto ptr = soffset + loffset; @@ -2634,360 +2509,228 @@ struct TriLvlSchedTP1SingleBlockFunctorDiagValues { }, diff); #endif - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; + lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl + // per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT -template -struct ReturnTeamPolicyType; - -#ifdef KOKKOS_ENABLE_SERIAL -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_OPENMP -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif - -template -struct ReturnRangePolicyType; - -#ifdef KOKKOS_ENABLE_SERIAL -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_OPENMP -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef 
KOKKOS_ENABLE_CUDA -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_HIP -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif - -template -static void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = + template + static void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, + const RHSType &rhs, LHSType &lhs) { + typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = thandle.get_sptrsvCudaGraph(); - auto nlevels = thandle.get_num_levels(); + auto nlevels = thandle.get_num_levels(); - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - // Kokkos::fence(); + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), + EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); + // Kokkos::fence(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - size_type node_count = 0; + size_type node_count = 0; - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); + // Start capturing stream + if (thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); - using policy_type = ReturnTeamPolicyType; + auto policy = std::is_same::value + ? 
team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_l_team_cudagraph", Kokkos::Experimental::require( - ReturnTeamPolicyType::get_policy( - lvl_nodes, team_size, cuda1), + policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedTP1SolverFunctor( + ValuesType, LHSType, RHSType>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); - node_count += hnodes_per_level(iter); + node_count += hnodes_per_level(iter); + } } + cudaStreamEndCapture(stream1, &graph); + + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0); + thandle.cudagraphCreated = true; } - cudaStreamEndCapture(stream1, &graph); + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, - 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - - cudaStreamSynchronize(stream1); - Kokkos::fence(); -} // end lower_tri_solve_cg - -template -static void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = + cudaStreamSynchronize(stream1); + Kokkos::fence(); + } // end lower_tri_solve_cg + + template + static void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, + const RHSType &rhs, LHSType &lhs) { + typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = thandle.get_sptrsvCudaGraph(); - auto nlevels = thandle.get_num_levels(); + auto nlevels = thandle.get_num_levels(); - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), + EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - size_type node_count = 0; + size_type node_count = 0; - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 
64 : team_size; - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); + // Start capturing stream + if (thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); - using policy_type = ReturnTeamPolicyType; + auto policy = std::is_same::value + ? team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_u_team_cudagraph", Kokkos::Experimental::require( - ReturnTeamPolicyType::get_policy( - lvl_nodes, team_size, cuda1), + policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedTP1SolverFunctor( + ValuesType, LHSType, RHSType>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); - node_count += hnodes_per_level(iter); + node_count += hnodes_per_level(iter); + } } - } - cudaStreamEndCapture(stream1, &graph); + cudaStreamEndCapture(stream1, &graph); - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, - 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0); + thandle.cudagraphCreated = true; + } + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - cudaStreamSynchronize(stream1); - Kokkos::fence(); -} // end upper_tri_solve_cg + cudaStreamSynchronize(stream1); + Kokkos::fence(); + } // end upper_tri_solve_cg #endif -template -static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle is + // properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using namespace KokkosSparse::Experimental; - using memory_space = typename 
TriSolveHandle::HandleTempMemorySpace; - using device_t = Kokkos::Device; - using integer_view_t = typename TriSolveHandle::integer_view_t; - using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; - using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View; + using namespace KokkosSparse::Experimental; + using device_t = Kokkos::Device; + using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; + using scalar_t = typename ValuesType::non_const_value_type; + using row_map_host_view_t = Kokkos::View; - row_map_host_view_t row_map_host; + row_map_host_view_t row_map_host; - const scalar_t zero(0.0); - const scalar_t one(1.0); + const scalar_t zero(0.0); + const scalar_t one(1.0); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); - if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - row_map_host = row_map_host_view_t( + row_map_host = row_map_host_view_t( Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); - } + Kokkos::deep_copy(row_map_host, row_map); + } - // inversion options - const bool invert_diagonal = thandle.get_invert_diagonal(); - const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); - const bool unit_diagonal = thandle.is_unit_diagonal(); + // inversion options + const bool invert_diagonal = thandle.get_invert_diagonal(); + const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); + const bool unit_diagonal = thandle.is_unit_diagonal(); - // supernode sizes - const int *supercols = thandle.get_supercols(); - const int *supercols_host = thandle.get_supercols_host(); + // supernode sizes + const int *supercols = thandle.get_supercols(); + const int *supercols_host = thandle.get_supercols_host(); - // kernel types - integer_view_t kernel_type = thandle.get_kernel_type(); - integer_view_t diag_kernel_type = thandle.get_diag_kernel_type(); + // kernel types + work_view_int_t kernel_type = thandle.get_kernel_type(); + work_view_int_t diag_kernel_type = thandle.get_diag_kernel_type(); - integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); - integer_view_host_t diag_kernel_type_host = + integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); + integer_view_host_t diag_kernel_type_host = thandle.get_diag_kernel_type_host(); - // workspaces - integer_view_t work_offset = thandle.get_work_offset(); - integer_view_host_t work_offset_host = thandle.get_work_offset_host(); - auto work = thandle.get_workspace(); + // workspaces + work_view_int_t work_offset = thandle.get_work_offset(); + integer_view_host_t work_offset_host = thandle.get_work_offset_host(); + auto work = thandle.get_workspace(); #endif - size_type node_count = 0; + size_type node_count = 0; #ifdef profile_supernodal_etree - Kokkos::Timer sptrsv_timer; - sptrsv_timer.reset(); + Kokkos::Timer 
sptrsv_timer; + sptrsv_timer.reset(); #endif - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - { + for (size_type lvl = 0; lvl < nlevels; ++lvl) { size_type lvl_nodes = hnodes_per_level(lvl); if (lvl_nodes != 0) { @@ -2999,27 +2742,24 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( - Kokkos::RangePolicy(space, node_count, - node_count + lvl_nodes), + range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( + ValuesType, LHSType, RHSType>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - using team_policy_t = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count); #else LowerTriLvlSchedTP1SolverFunctor + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); #endif @@ -3027,14 +2767,14 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); else Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, team_size), + team_policy(space, lvl_nodes, team_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } @@ -3070,10 +2810,10 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count, vector_size, 0); #else LowerTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); #endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size @@ -3091,7 +2831,6 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #endif // NOTE: we currently supports only default_layout = LayoutLeft - using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3103,13 +2842,13 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_diagonal && !invert_offdiagonal) { // copy diagonals to workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3194,13 +2933,13 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = 
work_offset.data(); - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3208,7 +2947,7 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, // launching sparse-triangular solve functor LowerTriSupernodalFunctor + ValuesType, LHSType> sptrsv_functor(unit_diagonal, invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, @@ -3216,7 +2955,7 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_functor); @@ -3238,7 +2977,6 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3250,25 +2988,25 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3278,13 +3016,13 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace - SparseTriSupernodalSpMVFunctor + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_finalize_functor); @@ -3303,164 +3041,155 @@ static void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, cudaProfilerStop(); #endif } // end if - } // scope for if-block - - } // end for lvl + } // end for lvl #ifdef profile_supernodal_etree - Kokkos::fence(); - double sptrsv_time_seconds = sptrsv_timer.seconds(); - std::cout << " + Execution space : " << 
execution_space::name() - << std::endl; - std::cout << " + Memory space : " << memory_space::name() << std::endl; - std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl - << std::endl; + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds(); + std::cout << " + Execution space : " << execution_space::name() + << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() << std::endl; + std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl + << std::endl; #endif -} // end lower_tri_solve + } // end lower_tri_solve -template -static void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using device_t = Kokkos::Device; - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); - // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); - - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + using device_t = Kokkos::Device; + + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle is + // properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); + // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); + + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using namespace KokkosSparse::Experimental; - using integer_view_t = typename TriSolveHandle::integer_view_t; - using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; - using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View; + using namespace KokkosSparse::Experimental; + using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; + using scalar_t = typename ValuesType::non_const_value_type; + using row_map_host_view_t = Kokkos::View; - row_map_host_view_t row_map_host; + row_map_host_view_t row_map_host; - const scalar_t zero(0.0); - const scalar_t one(1.0); + const scalar_t zero(0.0); + const scalar_t one(1.0); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); - if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || 
- thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - row_map_host = row_map_host_view_t( + row_map_host = row_map_host_view_t( Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); - } + Kokkos::deep_copy(row_map_host, row_map); + } - // supernode sizes - const int *supercols = thandle.get_supercols(); - const int *supercols_host = thandle.get_supercols_host(); + // supernode sizes + const int *supercols = thandle.get_supercols(); + const int *supercols_host = thandle.get_supercols_host(); - // inversion option - const bool invert_diagonal = thandle.get_invert_diagonal(); - const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); + // inversion option + const bool invert_diagonal = thandle.get_invert_diagonal(); + const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); - // kernel types - integer_view_t kernel_type = thandle.get_kernel_type(); - integer_view_t diag_kernel_type = thandle.get_diag_kernel_type(); + // kernel types + work_view_int_t kernel_type = thandle.get_kernel_type(); + work_view_int_t diag_kernel_type = thandle.get_diag_kernel_type(); - integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); - integer_view_host_t diag_kernel_type_host = + integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); + integer_view_host_t diag_kernel_type_host = thandle.get_diag_kernel_type_host(); - // workspace - integer_view_t work_offset = thandle.get_work_offset(); - integer_view_host_t work_offset_host = thandle.get_work_offset_host(); - auto work = thandle.get_workspace(); + // workspace + work_view_int_t work_offset = thandle.get_work_offset(); + integer_view_host_t work_offset_host = thandle.get_work_offset_host(); + auto work = thandle.get_workspace(); #endif - size_type node_count = 0; + size_type node_count = 0; -// This must stay serial; would be nice to try out Cuda's graph stuff to reduce -// kernel launch overhead + // This must stay serial; would be nice to try out Cuda's graph stuff to reduce + // kernel launch overhead #ifdef profile_supernodal_etree - Kokkos::Timer sptrsv_timer; - sptrsv_timer.reset(); + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); #endif - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - size_type lvl_nodes = hnodes_per_level(lvl); + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + size_type lvl_nodes = hnodes_per_level(lvl); - if (lvl_nodes != 0) { + if (lvl_nodes != 0) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStart(); + cudaProfilerStart(); #endif - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( - Kokkos::RangePolicy(space, node_count, - node_count + lvl_nodes), + range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedRPSolverFunctor( + 
LHSType, RHSType>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - using team_policy_t = Kokkos::TeamPolicy; + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - int team_size = thandle.get_team_size(); + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count); #else - UpperTriLvlSchedTP1SolverFunctor + UpperTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); #endif - if (team_size == -1) - Kokkos::parallel_for( + if (team_size == -1) + Kokkos::parallel_for( "parfor_u_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - else - Kokkos::parallel_for( + else + Kokkos::parallel_for( "parfor_u_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, team_size), + team_policy(space, lvl_nodes, team_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - } - // TP2 algorithm has issues with some offset-ordinal combo to be addressed - /* - else if ( thandle.get_algorithm() == -KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef -Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { + } + // TP2 algorithm has issues with some offset-ordinal combo to be addressed + /* + else if ( thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef + Kokkos::TeamPolicy tvt_policy_type; + + int team_size = thandle.get_team_size(); + if ( team_size == -1 ) { team_size = std::is_same< typename -Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : + Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 
1 : 64; } int vector_size = thandle.get_team_size(); @@ -3480,10 +3209,10 @@ node_group (thread has full ownership of a node) #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, +LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count, vector_size, 0); #else UpperTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, +LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); #endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( @@ -3491,827 +3220,805 @@ nodes_grouped_by_level, node_count, node_groups); #endif tstf); } // end elseif */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { #ifdef profile_supernodal_etree - size_t flops = 0; - Kokkos::Timer timer; - timer.reset(); + size_t flops = 0; + Kokkos::Timer timer; + timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; - if (thandle.is_column_major()) { // U stored in CSC - if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) - scalar_t *dataU = const_cast(values.data()); + if (thandle.is_column_major()) { // U stored in CSC + if (diag_kernel_type_host(lvl) == 3) { + // using device-level kernels (functor is called to gather the input + // into workspace) + scalar_t *dataU = const_cast(values.data()); - if (invert_diagonal && !invert_offdiagonal) { - // copy diagonals to workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + if (invert_diagonal && !invert_offdiagonal) { + // copy diagonals to workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); - } - for (size_type league_rank = 0; league_rank < lvl_nodes; - league_rank++) { - auto s = nodes_grouped_by_level_host(node_count + league_rank); - - // supernodal column size - int j1 = supercols_host[s]; - int j2 = supercols_host[s + 1]; - int nscol = + } + for (size_type league_rank = 0; league_rank < lvl_nodes; + league_rank++) { + auto s = nodes_grouped_by_level_host(node_count + league_rank); + + // supernodal column size + int j1 = supercols_host[s]; + int j2 = supercols_host[s + 1]; + int nscol = j2 - j1; // number of columns in the s-th supernode column - int i1 = row_map_host(j1); - int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; // "total" number of rows in all the - // supernodes (diagonal+off-diagonal) - int nsrow2 = nsrow - nscol; // "total" number of rows in all the - // off-diagonal supernodes + int i1 = row_map_host(j1); + int i2 = row_map_host(j1 + 1); + int nsrow = i2 - i1; // "total" number of rows in all the + // supernodes 
(diagonal+off-diagonal) + int nsrow2 = nsrow - nscol; // "total" number of rows in all the + // off-diagonal supernodes #ifdef profile_supernodal_etree - flops += 2 * (nscol * nsrow); + flops += 2 * (nscol * nsrow); #endif - // workspace - int workoffset = work_offset_host(s); + // workspace + int workoffset = work_offset_host(s); // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View + Kokkos::View viewU(&dataU[i1], nsrow, nscol); - if (invert_offdiagonal) { - auto Uij = + if (invert_offdiagonal) { + auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Z = Kokkos::subview( + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + auto Z = Kokkos::subview( work, range_type( workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); - } else { - // extract part of the solution, corresponding to the diagonal - // block - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + } else { + // extract part of the solution, corresponding to the diagonal + // block + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - // "triangular-solve" to compute Xj - // extract the diagonal block of s-th supernocal column of U - auto Ujj = + // "triangular-solve" to compute Xj + // extract the diagonal block of s-th supernocal column of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - auto Y = Kokkos::subview( + if (invert_diagonal) { + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); - } else { - // NOTE: we currently supports only default_layout = - // LayoutLeft - Kokkos::View + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); + } else { + // NOTE: we currently supports only default_layout = + // LayoutLeft + Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); - } - // update off-diagonal blocks - if (nsrow2 > 0) { - // extract the off-diagonal blocks of s-th supernodal column - // of U - auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), - Kokkos::ALL()); - auto Z = Kokkos::subview( + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); + } + // update off-diagonal blocks + if (nsrow2 > 0) { + // extract the off-diagonal blocks of s-th supernodal column + // of U + auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), + Kokkos::ALL()); + auto Z = Kokkos::subview( work, range_type( workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + } } } - } - if (invert_offdiagonal) { - // copy diagonals from workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + if (invert_offdiagonal) { + // copy diagonals from workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); + } } - } - // launching sparse-triangular solve functor - UpperTriTranSupernodalFunctor + // launching sparse-triangular solve functor + UpperTriTranSupernodalFunctor sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - using team_policy_t = Kokkos::TeamPolicy; - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_usolve_tran_supernode", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_functor); - } else { // U stored in CSR - // launching sparse-triangular solve functor - UpperTriSupernodalFunctor + } else { // U stored in CSR + // launching sparse-triangular solve functor + UpperTriSupernodalFunctor sptrsv_functor(invert_diagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - using team_policy_t = Kokkos::TeamPolicy; - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_usolve_supernode", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_functor); - if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) - scalar_t *dataU = const_cast(values.data()); + if (diag_kernel_type_host(lvl) == 3) { + // using device-level kernels (functor is called to gather the input + // into workspace) + scalar_t *dataU = const_cast(values.data()); - for (size_type league_rank = 0; league_rank < lvl_nodes; - league_rank++) { - auto s = nodes_grouped_by_level_host(node_count + league_rank); + for (size_type league_rank = 0; league_rank < lvl_nodes; + league_rank++) { + auto s = nodes_grouped_by_level_host(node_count + league_rank); - // supernodal column size - int j1 = supercols_host[s]; - int j2 = supercols_host[s + 1]; - int nscol = + // supernodal column size + int j1 = supercols_host[s]; + int j2 = supercols_host[s + 1]; + int nscol = j2 - j1; // number of columns in the s-th supernode column - // "total" number of rows in all the supernodes - // (diagonal+off-diagonal) - int i1 = row_map_host(j1); - int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; - // "total" number of rows in all the off-diagonal supernodes - int nsrow2 = nsrow - nscol; + // "total" number of rows in all the supernodes + // (diagonal+off-diagonal) + int i1 = row_map_host(j1); + int i2 = row_map_host(j1 + 1); + int nsrow = i2 - i1; + // "total" number of rows in all the off-diagonal supernodes + int nsrow2 = nsrow - nscol; - // workspace - int workoffset = work_offset_host(s); + // workspace + int workoffset = work_offset_host(s); - // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View + // create a view for the s-th supernocal block column + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View viewU(&dataU[i1], nsrow, nscol); - // extract part of the solution, corresponding to the diagonal - // block - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Y = Kokkos::subview( + // extract part of the solution, corresponding to the diagonal + // block + auto Xj = 
Kokkos::subview(lhs, range_type(j1, j2)); + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - // update with off-diagonal blocks - if (nsrow2 > 0) { - // extract the off-diagonal blocks of s-th supernodal column of - // U - auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), - Kokkos::ALL()); - auto Z = Kokkos::subview( + // update with off-diagonal blocks + if (nsrow2 > 0) { + // extract the off-diagonal blocks of s-th supernodal column of + // U + auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), + Kokkos::ALL()); + auto Z = Kokkos::subview( work, range_type( workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); - } + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); + } - // "triangular-solve" to compute Xj - // extract the diagonal block of s-th supernocal column of U - auto Ujj = + // "triangular-solve" to compute Xj + // extract the diagonal block of s-th supernocal column of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); - } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View + if (invert_diagonal) { + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); + } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); + } } - } - if (invert_diagonal) { - // copy diagonals from workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor + if (invert_diagonal) { + // copy diagonals from workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); + } } } - } #ifdef profile_supernodal_etree - Kokkos::fence(); - double time_seconds = timer.seconds(); - std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds - << " flop count: " << flops - << " kernel-type: " << kernel_type_host(lvl) - << " # of supernodes: " << lvl_nodes << std::endl; + Kokkos::fence(); + double time_seconds = timer.seconds(); + std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds + << " flop count: " << flops + << " kernel-type: " << kernel_type_host(lvl) + << " # of supernodes: " << lvl_nodes << std::endl; #endif - } else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV || - thandle.get_algorithm() == - SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + } else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV || + thandle.get_algorithm() == + SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { #ifdef profile_supernodal_etree - Kokkos::Timer timer; - timer.reset(); + Kokkos::Timer timer; + timer.reset(); #endif - // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + // initialize input & output vectors - // update with one, or two, spmv - bool transpose_spmv = + // update with one, or two, spmv + bool 
transpose_spmv = ((!thandle.transpose_spmv() && thandle.is_column_major()) || (thandle.transpose_spmv() && !thandle.is_column_major())); - const char *tran = (transpose_spmv ? "T" : "N"); - if (!transpose_spmv) { // U stored in CSR - if (!invert_offdiagonal) { - // solve with diagonals - auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); - // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor + const char *tran = (transpose_spmv ? "T" : "N"); + if (!transpose_spmv) { // U stored in CSR + if (!invert_offdiagonal) { + // solve with diagonals + auto digmat = thandle.get_diagblock(lvl); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); + // copy from work to lhs corresponding to diagonal blocks + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); - } else { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor + } else { + // zero out lhs corresponding to diagonal blocks in lhs, and copy to + // work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); - } - // update with off-diagonals (potentiall combined with diagonal - // solves) - auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); - } else { - if (!invert_offdiagonal) { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor + } + // update with off-diagonals (potentiall combined with diagonal + // solves) + auto submat = thandle.get_submatrix(lvl); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); + } else { + if (!invert_offdiagonal) { + // zero out lhs corresponding to diagonal blocks in lhs, and copy to + // work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); - // update with off-diagonals - auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); + // update with off-diagonals + auto submat = thandle.get_submatrix(lvl); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); - // solve with diagonals - auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); - } else { - std::cout << " ** invert_offdiag with U in CSR not supported **" - << std::endl; + // solve with diagonals + auto digmat = thandle.get_diagblock(lvl); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); 
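// The SUPERNODAL_SPMV branch above expresses each level of the triangular
// solve as sparse matrix-vector products instead of per-supernode trsm/gemv
// calls: one SpMV gathers the off-diagonal updates into the workspace and a
// second SpMV applies the (pre-inverted) diagonal blocks.  What follows is a
// minimal serial sketch of that idea on plain CSR arrays; the struct and
// helpers (Csr, spmv_csr, solve_level_with_spmv, offdiag, diag_inv) are
// hypothetical illustrations, not part of the KokkosKernels API.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Csr {
  std::vector<std::size_t> row_map;  // length nrows + 1
  std::vector<int> entries;          // column indices
  std::vector<double> values;        // nonzero values
};

// y += A * x (textbook CSR SpMV)
inline void spmv_csr(const Csr &A, const std::vector<double> &x,
                     std::vector<double> &y) {
  const std::size_t nrows = A.row_map.size() - 1;
  for (std::size_t i = 0; i < nrows; ++i) {
    double sum = 0.0;
    for (std::size_t k = A.row_map[i]; k < A.row_map[i + 1]; ++k)
      sum += A.values[k] * x[A.entries[k]];
    y[i] += sum;
  }
}

// One level of the solve-by-SpMV scheme, assuming the diagonal blocks of this
// level were inverted during setup (diag_inv) and the strictly off-diagonal
// part of the level is stored separately (offdiag):
//   work = offdiag * x                                   -- first SpMV
//   x    = diag_inv * (b - work) on this level's rows    -- second SpMV
void solve_level_with_spmv(const Csr &offdiag, const Csr &diag_inv,
                           const std::vector<double> &b,
                           const std::vector<int> &level_rows,
                           std::vector<double> &x, std::vector<double> &work) {
  std::fill(work.begin(), work.end(), 0.0);
  spmv_csr(offdiag, x, work);
  std::vector<double> tmp(b.size(), 0.0);
  for (int r : level_rows) tmp[r] = b[r] - work[r];
  std::fill(work.begin(), work.end(), 0.0);
  spmv_csr(diag_inv, tmp, work);
  for (int r : level_rows) x[r] = work[r];
}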
+ } else { + std::cout << " ** invert_offdiag with U in CSR not supported **" + << std::endl; + } } - } - // reinitialize workspace - SparseTriSupernodalSpMVFunctor + // reinitialize workspace + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_finalize_functor); #ifdef profile_supernodal_etree - Kokkos::fence(); - double time_seconds = timer.seconds(); - std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds - << " kernel-type: " << kernel_type_host(lvl) - << " # of supernodes: " << lvl_nodes << std::endl; + Kokkos::fence(); + double time_seconds = timer.seconds(); + std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds + << " kernel-type: " << kernel_type_host(lvl) + << " # of supernodes: " << lvl_nodes << std::endl; #endif - } + } #endif - node_count += lvl_nodes; + node_count += lvl_nodes; #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - } // end if - } // end for lvl + } // end if + } // end for lvl #ifdef profile_supernodal_etree - Kokkos::fence(); - double sptrsv_time_seconds = sptrsv_timer.seconds(); - std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl - << std::endl; - std::cout << " + Execution space : " << ExecutionSpace::name() - << std::endl; - std::cout << " + Memory space : " << memory_space::name() << std::endl; + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds(); + std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl + << std::endl; + std::cout << " + Execution space : " << execution_space::name() + << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() << std::endl; #endif -} // end upper_tri_solve + } // end upper_tri_solve -template -static void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, LHSType &lhs, - const bool /*is_lowertri_*/) { + template +static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, LHSType &lhs, + const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; + // Algorithm is checked before this function is called + auto h_chain_ptr = thandle.get_host_chain_ptr(); + size_type num_chain_entries = thandle.get_num_chain_entries(); - // Algorithm is checked before this function is called - auto h_chain_ptr = thandle.get_host_chain_ptr(); - size_type num_chain_entries = thandle.get_num_chain_entries(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle is + // properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); - // Keep this a host View, create device version and 
copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + const bool is_lowertri = thandle.is_lower_tri(); - const bool is_lowertri = thandle.is_lower_tri(); - - size_type node_count = 0; + size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; - using large_cutoff_policy_type = - Kokkos::TeamPolicy; + using large_cutoff_policy_type = + Kokkos::TeamPolicy; /* using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = + ValuesType, LHSType, RHSType>; using LTP1Functor = LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = + LHSType, RHSType>; using UTP1Functor = UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = + LHSType, RHSType>; using LSingleBlockFunctor = LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = + LHSType, RHSType>; using USingleBlockFunctor = UpperTriLvlSchedTP1SingleBlockFunctor; + LHSType, RHSType>; */ - using SingleBlockFunctor = + using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor; + LHSType, RHSType>; - int team_size = thandle.get_team_size(); - int vector_size = + int team_size = thandle.get_team_size(); + int vector_size = thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; - auto cutoff = thandle.get_chain_threshold(); - int team_size_singleblock = team_size; - - // Enumerate options - // ts -1,0 | cu 0 - select default ts == 1 - // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) - // ts > 0 | cu 0 - set - // ts > 0 | cu > 0 - set - // Controls ts,cu > 0 - // co > ts - not all rows can be mapped to a thread - must call largercutoff - // impl co <= ts - okay, kernel must be careful not to access out-of-bounds; - // some threads idol - if (team_size_singleblock <= 0 && cutoff == 0) { - team_size_singleblock = 1; - // If cutoff == 0, no single-block calls will be made, team_size_singleblock - // is unimportant - } + auto cutoff = thandle.get_chain_threshold(); + int team_size_singleblock = team_size; + + // Enumerate options + // ts -1,0 | cu 0 - select default ts == 1 + // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) + // ts > 0 | cu 0 - set + // ts > 0 | cu > 0 - set + // Controls ts,cu > 0 + // co > ts - not all rows can be mapped to a thread - must call largercutoff + // impl co <= ts - okay, kernel must be careful not to access out-of-bounds; + // some threads idol + if (team_size_singleblock <= 0 && cutoff == 0) { + team_size_singleblock = 1; + // If cutoff == 0, no single-block calls will be made, team_size_singleblock + // is unimportant + } - // This is only necessary for Lower,UpperTri functor versions; else, - // is_lowertri can be passed as arg to the generic Tri functor... - if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); + // This is only necessary for Lower,UpperTri functor versions; else, + // is_lowertri can be passed as arg to the generic Tri functor... 
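// tri_solve_chain below consumes a host-side chain pointer array: a "chain"
// is either a single large level (solved with one team per row, as in the
// non-chained path) or a run of consecutive small levels fused into a single
// single-block launch to cut kernel-launch overhead.  The sketch that follows
// shows, under that assumption, how such chain boundaries could be derived
// from a nodes-per-level array; build_chain_ptr is a hypothetical helper for
// illustration only, not the handle's actual symbolic routine.
#include <cstddef>
#include <vector>

std::vector<std::size_t> build_chain_ptr(
    const std::vector<int> &nodes_per_level, int cutoff) {
  std::vector<std::size_t> chain_ptr{0};
  const std::size_t nlevels = nodes_per_level.size();
  std::size_t lvl = 0;
  while (lvl < nlevels) {
    if (nodes_per_level[lvl] > cutoff) {
      // Large level: a chain of length one, taken by the team-per-row branch
      // (echain - schain == 1) in the loops below.
      chain_ptr.push_back(++lvl);
    } else {
      // Fuse the run of small levels that starts here; taken by the
      // single-block branch below.
      std::size_t end = lvl;
      while (end < nlevels && nodes_per_level[end] <= cutoff) ++end;
      chain_ptr.push_back(end);
      lvl = end;
    }
  }
  return chain_ptr;  // plays the role of h_chain_ptr: [schain, echain) pairs
}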
+ if (is_lowertri) { + for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count); #else - LowerTriLvlSchedTP1SolverFunctor + LowerTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); #endif - if (team_size == -1) { - team_size = - policy_type(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? + Kokkos::parallel_for( "parfor_l_team_chain1", Kokkos::Experimental::require( - policy_type(space, lvl_nodes, team_size, vector_size), + team_policy(space, lvl_nodes, team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - node_count += lvl_nodes; + node_count += lvl_nodes; - } else { - size_type lvl_nodes = 0; + } else { + size_type lvl_nodes = 0; - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } - if (team_size_singleblock <= 0) { - team_size_singleblock = - policy_type(space, 1, 1, vector_size) + if (team_size_singleblock <= 0) { + team_size_singleblock = + team_policy(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri), Kokkos::ParallelForTag()); - } + } - if (cutoff <= team_size_singleblock) { + if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, true); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + LowerTriLvlSchedTP1SingleBlockFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain); #endif - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_l_team_chainmulti", Kokkos::Experimental::require( - policy_type(space, 1, team_size_singleblock, vector_size), + team_policy(space, 1, team_size_singleblock, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally + } else { + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, true, 0, cutoff); #else - 
LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + LowerTriLvlSchedTP1SingleBlockFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, cutoff); #endif - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", Kokkos::Experimental::require( large_cutoff_policy_type(1, team_size_singleblock, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); + } + node_count += lvl_nodes; } - node_count += lvl_nodes; + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? - } - } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); + } else { + for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count); #else - UpperTriLvlSchedTP1SolverFunctor + UpperTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); #endif - if (team_size == -1) { - team_size = - policy_type(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } - // TODO To use cudagraph here, need to know how many non-unit chains - // there are, create a graph for each and launch accordingly - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( + // TODO To use cudagraph here, need to know how many non-unit chains + // there are, create a graph for each and launch accordingly + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
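// Serial reference for what each of these level/chain launches computes: for
// every row scheduled in the current level, accumulate -A(row,col) * x(col)
// over the off-diagonal entries (all of which belong to earlier levels and
// are therefore already solved) and divide by the diagonal.  Plain CSR
// arrays; solve_level_reference is an illustrative helper, not part of the
// library.
#include <cstddef>
#include <vector>

void solve_level_reference(const std::vector<std::size_t> &row_map,
                           const std::vector<int> &entries,
                           const std::vector<double> &values,
                           const std::vector<int> &nodes_grouped_by_level,
                           std::size_t node_count,  // first node of this level
                           std::size_t lvl_nodes,   // number of nodes in it
                           const std::vector<double> &rhs,
                           std::vector<double> &lhs) {
  for (std::size_t n = 0; n < lvl_nodes; ++n) {
    const int rowid = nodes_grouped_by_level[node_count + n];
    double diff = 0.0;
    double diag = 1.0;
    for (std::size_t k = row_map[rowid]; k < row_map[rowid + 1]; ++k) {
      const int colid = entries[k];
      if (colid == rowid)
        diag = values[k];                // diagonal entry
      else
        diff -= values[k] * lhs[colid];  // contribution from earlier levels
    }
    lhs[rowid] = (rhs[rowid] + diff) / diag;
  }
  // The device functors parallelize the outer loop over rows (one team per
  // row) and the inner reduction over nonzeros (TeamThreadRange), and exploit
  // sortedness to read the diagonal directly instead of searching for it.
}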
+ Kokkos::parallel_for( "parfor_u_team_chain1", Kokkos::Experimental::require( - policy_type(space, lvl_nodes, team_size, vector_size), + team_policy(space, lvl_nodes, team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - node_count += lvl_nodes; + node_count += lvl_nodes; - } else { - size_type lvl_nodes = 0; + } else { + size_type lvl_nodes = 0; - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } - if (team_size_singleblock <= 0) { - // team_size_singleblock = policy_type(1, 1, - // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), - // Kokkos::ParallelForTag()); - team_size_singleblock = - policy_type(space, 1, 1, vector_size) + if (team_size_singleblock <= 0) { + // team_size_singleblock = team_policy(1, 1, + // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, + // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), + // Kokkos::ParallelForTag()); + team_size_singleblock = + team_policy(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri), Kokkos::ParallelForTag()); - } + } - if (cutoff <= team_size_singleblock) { + if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + UpperTriLvlSchedTP1SingleBlockFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain); #endif - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_u_team_chainmulti", Kokkos::Experimental::require( - policy_type(space, 1, team_size_singleblock, vector_size), + team_policy(space, 1, team_size_singleblock, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally + } else { + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri, 0, cutoff); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + UpperTriLvlSchedTP1SingleBlockFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, cutoff); #endif - Kokkos::parallel_for( + Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", Kokkos::Experimental::require( large_cutoff_policy_type(1, team_size_singleblock, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); + } + node_count += lvl_nodes; } - node_count += lvl_nodes; + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? 
that is, can the + // parallel_for launch before the s/echain values have + // been updated? } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? } - } - -} // end tri_solve_chain - -// -------------------------------- -// Stream interfaces -// -------------------------------- - -template -static void lower_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using size_type = typename TriSolveHandle::size_type; - using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = + } // end tri_solve_chain + + // -------------------------------- + // Stream interfaces + // -------------------------------- + template + static void lower_tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. Launch work on all streams + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy( - execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), + range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), LowerTriLvlSchedRPSolverFunctor( + ValuesType, LHSType, RHSType>( row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], true, node_count_v[i]); #else - LowerTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + LowerTriLvlSchedTP1SolverFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType> tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif - if (team_size == -1) - Kokkos::parallel_for( + if (team_size == -1) + Kokkos::parallel_for( "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl -} // end lower_tri_solve_streams - -template -static void upper_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using size_type = typename TriSolveHandle::size_type; - using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl + } // end lower_tri_solve_streams + + template + static void upper_tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector 
nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. Launch work on all streams + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy( - execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), + range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), UpperTriLvlSchedRPSolverFunctor( + ValuesType, LHSType, RHSType>( row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], false, node_count_v[i]); #else - UpperTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + UpperTriLvlSchedTP1SolverFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType> tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif - if (team_size == -1) - Kokkos::parallel_for( + if (team_size == -1) + Kokkos::parallel_for( "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, team_size), tstf); + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl -} // end upper_tri_solve_streams + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl + } // end upper_tri_solve_streams }; // struct SptrsvWrap From a19bbd6fc839ae552e07c339869efa6c46483247 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 14 Jun 2024 14:02:59 -0600 Subject: [PATCH 03/41] Fix a couple warnings --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 1e386f43a4..be687b67b5 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2679,7 +2679,6 @@ struct SptrsvWrap { using namespace KokkosSparse::Experimental; using device_t = Kokkos::Device; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; using row_map_host_view_t = Kokkos::View; row_map_host_view_t row_map_host; @@ -3080,7 +3079,6 @@ struct SptrsvWrap { #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; using row_map_host_view_t 
= Kokkos::View; row_map_host_view_t row_map_host; From 03908a37394c10147459bed4992ec22c5c78a5d2 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 24 Jun 2024 10:55:45 -0600 Subject: [PATCH 04/41] formatting --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 24 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 2451 +++++++++-------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 38 +- sparse/src/KokkosKernels_Handle.hpp | 3 +- sparse/src/KokkosSparse_sptrsv_handle.hpp | 56 +- 5 files changed, 1309 insertions(+), 1263 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 32fa122196..3caa2bcc31 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -47,18 +47,18 @@ struct IlukWrap { // // Useful types // - using execution_space = typename IlukHandle::execution_space; - using memory_space = typename IlukHandle::memory_space; - using lno_t = typename IlukHandle::nnz_lno_t; - using size_type = typename IlukHandle::size_type; - using scalar_t = typename IlukHandle::nnz_scalar_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - using LevelViewType = typename IlukHandle::nnz_lno_view_t; - using karith = typename Kokkos::ArithTraits; - using team_policy = typename IlukHandle::TeamPolicy; - using member_type = typename team_policy::member_type; - using range_policy = typename IlukHandle::RangePolicy; + using execution_space = typename IlukHandle::execution_space; + using memory_space = typename IlukHandle::memory_space; + using lno_t = typename IlukHandle::nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using scalar_t = typename IlukHandle::nnz_scalar_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + using LevelViewType = typename IlukHandle::nnz_lno_view_t; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename IlukHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename IlukHandle::RangePolicy; static team_policy get_team_policy(const size_type nrows, const int team_size) { diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index be687b67b5..d385a390cd 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -55,7 +55,6 @@ namespace Experimental { template struct SptrsvWrap { - // // Useful types // @@ -68,15 +67,15 @@ struct SptrsvWrap { using row_map_t = typename TriSolveHandle::nnz_row_view_t; using entries_t = typename TriSolveHandle::nnz_lno_view_t; using values_t = typename TriSolveHandle::nnz_scalar_view_t; - using work_view_t = Kokkos::View>; - using work_view_int_t = Kokkos::View>; - using karith = typename Kokkos::ArithTraits; - using team_policy = typename TriSolveHandle::TeamPolicy; - using member_type = typename team_policy::member_type; - using range_policy = typename TriSolveHandle::RangePolicy; - using range_type = Kokkos::pair; + using work_view_t = + Kokkos::View>; + using work_view_int_t = + Kokkos::View>; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename TriSolveHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename TriSolveHandle::RangePolicy; + using range_type = Kokkos::pair; // Tag 
structs struct UnsortedTag {}; @@ -103,8 +102,8 @@ struct SptrsvWrap { // This functor unifies the lower and upper implementations, the hope is the // "is_lowertri" check does not add noticable time on larger problems - template + template struct TriLvlSchedTP1SolverFunctor { RowMapType row_map; EntriesType entries; @@ -123,15 +122,16 @@ struct SptrsvWrap { const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const bool &is_lowertri_, const long &node_count_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_) {} + const bool &is_lowertri_, + const long &node_count_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + is_lowertri(is_lowertri_), + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -145,16 +145,16 @@ struct SptrsvWrap { scalar_t diff = scalar_t(0.0); Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); team.team_barrier(); @@ -163,7 +163,7 @@ struct SptrsvWrap { if (my_rank == 0) { // ASSUMPTION: sorted diagonal value located at eoffset - 1 lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); + : (rhs_rowid + diff) / values(soffset); } } @@ -181,17 +181,17 @@ struct SptrsvWrap { auto diag = -1; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); team.team_barrier(); // At end, finalize rowid == colid @@ -202,8 +202,8 @@ struct SptrsvWrap { } }; - template + template struct TriLvlSchedTP1SolverFunctorDiagValues { RowMapType row_map; EntriesType entries; @@ -219,24 +219,22 @@ struct SptrsvWrap { // offset long dense_nrows; - TriLvlSchedTP1SolverFunctorDiagValues(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, - const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - diagonal_values(diagonal_values_), - is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} + TriLvlSchedTP1SolverFunctorDiagValues( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const 
ValuesType &diagonal_values_, const bool is_lowertri_, + long node_count_, long dense_nrows_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + diagonal_values(diagonal_values_), + is_lowertri(is_lowertri_), + node_count(node_count_), + dense_nrows(dense_nrows_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -250,15 +248,15 @@ struct SptrsvWrap { scalar_t diff = scalar_t(0.0); Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); team.team_barrier(); @@ -272,8 +270,8 @@ struct SptrsvWrap { } }; - template + template struct TriLvlSchedTP2SolverFunctor { RowMapType row_map; EntriesType entries; @@ -295,16 +293,16 @@ struct SptrsvWrap { const entries_t &nodes_grouped_by_level_, const bool is_lowertri_, long node_count_, long node_groups_ = 0, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_), - node_groups(node_groups_), - dense_nrows(dense_nrows_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + is_lowertri(is_lowertri_), + node_count(node_count_), + node_groups(node_groups_), + dense_nrows(dense_nrows_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -313,31 +311,32 @@ struct SptrsvWrap { size_t nrows = row_map.extent(0) - 1; Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri + ? 
(rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); + } // end if + }); // end TeamThreadRange team.team_barrier(); } @@ -349,32 +348,32 @@ struct SptrsvWrap { size_t nrows = row_map.extent(0) - 1; Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange team.team_barrier(); } @@ -382,8 +381,8 @@ struct SptrsvWrap { // Lower vs Upper Multi-block Functors - template + template struct LowerTriLvlSchedRPSolverFunctor { RowMapType row_map; EntriesType entries; @@ -397,12 +396,12 @@ struct SptrsvWrap { const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { @@ -446,8 +445,8 @@ struct SptrsvWrap { } }; - template + template struct LowerTriLvlSchedTP1SolverFunctor { RowMapType row_map; EntriesType entries; @@ -466,14 +465,14 @@ struct SptrsvWrap { const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -487,15 +486,15 @@ struct SptrsvWrap { scalar_t diff = scalar_t(0.0); Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { 
+ auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); team.team_barrier(); @@ -521,17 +520,17 @@ struct SptrsvWrap { auto diag = -1; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); team.team_barrier(); // At end, finalize rowid == colid @@ -545,8 +544,8 @@ struct SptrsvWrap { // FIXME CUDA: This algorithm not working with all integral type combos // In any case, this serves as a skeleton for 3-level hierarchical parallelism // for alg dev - template + template struct LowerTriLvlSchedTP2SolverFunctor { RowMapType row_map; EntriesType entries; @@ -565,14 +564,14 @@ struct SptrsvWrap { const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -581,30 +580,30 @@ struct SptrsvWrap { size_t nrows = row_map.extent(0) - 1; Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } // end if - }); // end TeamThreadRange + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + } // end if + }); // end TeamThreadRange team.team_barrier(); } @@ -616,33 +615,33 @@ struct SptrsvWrap { size_t nrows = row_map.extent(0) - 1; Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + 
my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange team.team_barrier(); } @@ -666,15 +665,16 @@ struct SptrsvWrap { // constructor SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, const entries_t &nodes_grouped_by_level_, - const int *supercols_, const int *workoffset_, - LHSType &X_, work_view_t work_) - : flag(flag_), - node_count(node_count_), - nodes_grouped_by_level(nodes_grouped_by_level_), - supercols(supercols_), - workoffset(workoffset_), - X(X_), - work(work_) {} + const int *supercols_, + const int *workoffset_, LHSType &X_, + work_view_t work_) + : flag(flag_), + node_count(node_count_), + nodes_grouped_by_level(nodes_grouped_by_level_), + supercols(supercols_), + workoffset(workoffset_), + X(X_), + work(work_) {} // operator KOKKOS_INLINE_FUNCTION @@ -720,8 +720,7 @@ struct SptrsvWrap { // ----------------------------------------------------------- // Functor for Lower-triangular solve - template + template struct LowerTriSupernodalFunctor { const bool unit_diagonal; const bool invert_diagonal; @@ -746,42 +745,44 @@ struct SptrsvWrap { // constructor LowerTriSupernodalFunctor( // supernode info - const bool unit_diagonal_, const bool invert_diagonal_, - const bool invert_offdiagonal_, const int *supercols_, - // L in CSC - const ColptrView &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - int level_, work_view_int_t &kernel_type_, - work_view_int_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - LHSType &X_, - // workspace - work_view_t work_, work_view_int_t &work_offset_, - // - const entries_t &nodes_grouped_by_level_, long node_count_) - : unit_diagonal(unit_diagonal_), - invert_diagonal(invert_diagonal_), - invert_offdiagonal(invert_offdiagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} + const bool unit_diagonal_, const bool invert_diagonal_, + const bool invert_offdiagonal_, const 
int *supercols_, + // L in CSC + const ColptrView &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + LHSType &X_, + // workspace + work_view_t work_, work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, long node_count_) + : unit_diagonal(unit_diagonal_), + invert_diagonal(invert_diagonal_), + invert_offdiagonal(invert_offdiagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} // operator KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- + */ /* get inputs */ - /* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- + */ const int league_rank = team.league_rank(); // batch id const int team_size = team.team_size(); const int team_rank = team.team_rank(); @@ -804,7 +805,7 @@ struct SptrsvWrap { scalar_t *dataL = const_cast(values.data()); Kokkos::View - viewL(&dataL[i1], nsrow, nscol); + viewL(&dataL[i1], nsrow, nscol); // extract part of the solution, corresponding to the diagonal block auto Xj = Kokkos::subview(X, range_type(j1, j2)); @@ -812,21 +813,23 @@ struct SptrsvWrap { // workspace const int workoffset = work_offset(s); auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); + work, range_type(workoffset + nscol, workoffset + nsrow)); if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve if (invert_offdiagonal) { // combined TRSM solve with diagonal + GEMV update with off-diagonal auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nsrow)); // needed for gemv instead of trmv/trsv - auto Ljj = Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); + work, + range_type( + workoffset, + workoffset + nsrow)); // needed for gemv instead of trmv/trsv + auto Ljj = + Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); KokkosBlas::TeamGemv::invoke(team, one, - Ljj, Xj, + Ljj, + Xj, zero, Y); team.team_barrier(); @@ -837,57 +840,56 @@ struct SptrsvWrap { } else { /* TRSM with diagonal block */ // extract diagonal and off-diagonal blocks of L - auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); + auto Ljj = + Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { // workspace auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv + work, + range_type(workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv for (int ii = team_rank; ii < nscol; ii += team_size) { Y(ii) = Xj(ii); } team.team_barrier(); // calling team-level "Unblocked" gemv on small-size diagonal in // KokkosBatched - KokkosBlas::TeamGemv::invoke(team, - one, - Ljj, - Y, - zero, - Xj); + KokkosBlas::TeamGemv< + member_type, KokkosBlas::Trans::NoTranspose, + KokkosBlas::Algo::Gemv::Unblocked>::invoke(team, one, Ljj, Y, + zero, Xj); } else { // NOTE: we currently supports only 
default_layout = LayoutLeft Kokkos::View - Xjj(Xj.data(), nscol, 1); + Xjj(Xj.data(), nscol, 1); if (unit_diagonal) { KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, - Xjj); + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + Xjj); } else { KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, - Xjj); + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + Xjj); } } team.team_barrier(); /* GEMM to update with off diagonal blocks */ auto Lij = - Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); + Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); KokkosBlas::TeamGemv::invoke(team, one, - Lij, Xj, + Lij, + Xj, zero, Z); team.team_barrier(); @@ -897,11 +899,11 @@ struct SptrsvWrap { /* scatter vectors back into X */ int i2 = i1 + nscol; // offset into rowind int nsrow2 = - nsrow - - nscol; // "total" number of rows in all the off-diagonal supernodes + nsrow - + nscol; // "total" number of rows in all the off-diagonal supernodes Kokkos::View> - Xatomic(X.data(), X.extent(0)); + Xatomic(X.data(), X.extent(0)); for (int ii = team_rank; ii < nsrow2; ii += team_size) { int i = rowind(i2 + ii); Xatomic(i) -= Z(ii); @@ -912,13 +914,12 @@ struct SptrsvWrap { // ----------------------------------------------------------- // Functor for Upper-triangular solve in CSR - template + template struct UpperTriSupernodalFunctor { // NOTE: we currently supports only default_layout = LayoutLeft using SupernodeView = - typename Kokkos::View; + typename Kokkos::View; bool invert_diagonal; const int *supercols; @@ -941,39 +942,41 @@ struct SptrsvWrap { // constructor UpperTriSupernodalFunctor( // supernode info - bool invert_diagonal_, const int *supercols_, - // U in CSR - const ColptrType &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - int level_, work_view_int_t &kernel_type_, - work_view_int_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - LHSType &X_, - // workspace - work_view_t &work_, work_view_int_t &work_offset_, - // - const entries_t &nodes_grouped_by_level_, long node_count_) - : invert_diagonal(invert_diagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} + bool invert_diagonal_, const int *supercols_, + // U in CSR + const ColptrType &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + LHSType &X_, + // workspace + work_view_t &work_, work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, long node_count_) + 
: invert_diagonal(invert_diagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} // operator KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- + */ /* get inputs */ - /* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- + */ const int league_rank = team.league_rank(); // batch id const int team_size = team.team_size(); const int team_rank = team.team_rank(); @@ -1006,9 +1009,9 @@ struct SptrsvWrap { /* gather vector into Z */ int i2 = i1 + nscol; // offset into rowind auto Z = Kokkos::subview( - work, - range_type(workoffset + nscol, - workoffset + nsrow)); // needed with gemv for update&scatter + work, range_type(workoffset + nscol, + workoffset + + nsrow)); // needed with gemv for update&scatter using Z_type = decltype(Z); for (int ii = team_rank; ii < nsrow2; ii += team_size) { int i = rowind(i2 + ii); @@ -1019,12 +1022,12 @@ struct SptrsvWrap { if (diag_kernel_type(level) != 3) { // not device-level GEMV-udpate auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); using Uij_type = decltype(Uij); KokkosBlas::TeamGemv:: - template invoke( - team, -one, Uij, Z, one, Xj); + template invoke( + team, -one, Uij, Z, one, Xj); team.team_barrier(); /* TRSM with diagonal block */ @@ -1035,10 +1038,10 @@ struct SptrsvWrap { if (invert_diagonal) { // workspace auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv + work, + range_type( + workoffset, + workoffset + nscol)); // needed for gemv instead of trmv/trsv using Y_type = decltype(Y); for (int ii = team_rank; ii < nscol; ii += team_size) { Y(ii) = Xj(ii); @@ -1048,17 +1051,19 @@ struct SptrsvWrap { // caling team-level kernel in KokkosBatched on a small-size diagonal KokkosBlas::TeamGemv:: - template invoke( - team, one, Ujj, Y, zero, Xj); + template invoke( + team, one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View - Xjj(Xj.data(), nscol, 1); + Xjj(Xj.data(), nscol, 1); KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::Transpose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, + Xjj); } team.team_barrier(); } @@ -1067,8 +1072,7 @@ struct SptrsvWrap { // ----------------------------------------------------------- // Functor for Upper-triangular solve in CSC - template + template struct UpperTriTranSupernodalFunctor { const bool invert_diagonal; const bool invert_offdiagonal; @@ -1092,42 +1096,44 @@ struct SptrsvWrap { // constructor UpperTriTranSupernodalFunctor( // supernode info - const bool invert_diagonal_, const bool invert_offdiagonal_, - const int *supercols_, - - // U in CSC - 
const ColptrType &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - const int level_, const work_view_int_t &kernel_type_, - const work_view_int_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - const LHSType &X_, - // workspace - const work_view_t &work_, const work_view_int_t &work_offset_, - // - const entries_t &nodes_grouped_by_level_, const long node_count_) - : invert_diagonal(invert_diagonal_), - invert_offdiagonal(invert_offdiagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} + const bool invert_diagonal_, const bool invert_offdiagonal_, + const int *supercols_, + + // U in CSC + const ColptrType &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + const int level_, const work_view_int_t &kernel_type_, + const work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + const LHSType &X_, + // workspace + const work_view_t &work_, const work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, const long node_count_) + : invert_diagonal(invert_diagonal_), + invert_offdiagonal(invert_offdiagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} // operator KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- + */ /* get inputs */ - /* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- + */ const int league_rank = team.league_rank(); // batch id const int team_size = team.team_size(); const int team_rank = team.team_rank(); @@ -1151,7 +1157,7 @@ struct SptrsvWrap { scalar_t *dataU = const_cast(values.data()); Kokkos::View - viewU(&dataU[i1], nsrow, nscol); + viewU(&dataU[i1], nsrow, nscol); // extract part of solution, corresponding to the diagonal block U(s, s) auto Xj = Kokkos::subview(X, range_type(j1, j2)); @@ -1165,53 +1171,53 @@ struct SptrsvWrap { if (invert_offdiagonal) { // extract diagonal + off-diagonal blocks of U auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nsrow)); // needed with gemv for update&scatter - auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); + work, + range_type( + workoffset, + workoffset + nsrow)); // needed with gemv for update&scatter + auto Uij = + Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); KokkosBlas::TeamGemv::invoke(team, one, - Uij, Xj, + Uij, + Xj, zero, Y); team.team_barrier(); - // copy the diagonal back to output + // copy the diagonal back to output for (int ii = team_rank; ii < nscol; ii += team_size) { Xj(ii) = Y(ii); } } else { // extract diagonal block of U (stored on top) - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + auto Ujj = + Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if 
(invert_diagonal) { auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv + work, + range_type(workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv for (int ii = team_rank; ii < nscol; ii += team_size) { Y(ii) = Xj(ii); } team.team_barrier(); - KokkosBlas::TeamGemv::invoke(team, - one, - Ujj, - Y, - zero, - Xj); + KokkosBlas::TeamGemv< + member_type, KokkosBatched::Trans::NoTranspose, + KokkosBlas::Algo::Gemv::Unblocked>::invoke(team, one, Ujj, Y, + zero, Xj); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View - Xjj(Xj.data(), nscol, 1); + Xjj(Xj.data(), nscol, 1); KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, - Xjj); + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, + Xjj); } } team.team_barrier(); @@ -1219,15 +1225,16 @@ struct SptrsvWrap { if (nsrow2 > 0) { /* GEMM to update off diagonal blocks, Z = Uij * Xj */ auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); + work, range_type(workoffset + nscol, workoffset + nsrow)); if (!invert_offdiagonal && diag_kernel_type(level) != 3) { // not device-level TRSM-solve auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); KokkosBlas::TeamGemv::invoke(team, one, - Uij, Xj, + Uij, + Xj, zero, Z); team.team_barrier(); @@ -1237,7 +1244,7 @@ struct SptrsvWrap { int i2 = i1 + nscol; // offset into rowind Kokkos::View> - Xatomic(X.data(), X.extent(0)); + Xatomic(X.data(), X.extent(0)); for (int ii = team_rank; ii < nsrow2; ii += team_size) { int i = rowind(i2 + ii); Xatomic(i) -= Z(ii); @@ -1248,8 +1255,8 @@ struct SptrsvWrap { }; #endif - template + template struct UpperTriLvlSchedRPSolverFunctor { RowMapType row_map; EntriesType entries; @@ -1263,12 +1270,12 @@ struct SptrsvWrap { const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { @@ -1309,8 +1316,8 @@ struct SptrsvWrap { } }; - template + template struct UpperTriLvlSchedTP1SolverFunctor { RowMapType row_map; EntriesType entries; @@ -1329,14 +1336,14 @@ struct SptrsvWrap { const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -1350,15 +1357,15 @@ struct SptrsvWrap { scalar_t diff = scalar_t(0.0); 
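          // The team reduction just below is the parallel form of a plain
          // substitution step for this row. A serial sketch of the same
          // update, assuming a sorted upper-triangular CSR row with the
          // diagonal stored at soffset (the assumption already noted in
          // these functors):
          //
          //   scalar_t sum = rhs_rowid;
          //   for (auto ptr = soffset + 1; ptr < eoffset; ++ptr)
          //     sum -= values(ptr) * lhs(entries(ptr));
          //   lhs(rowid) = sum / values(soffset);
          //
          // Each thread accumulates its share of the sum into tdiff, and the
          // partial results are combined into diff before the divide.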
Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); team.team_barrier(); @@ -1384,17 +1391,17 @@ struct SptrsvWrap { auto diag = -1; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); team.team_barrier(); // At end, finalize rowid == colid @@ -1408,8 +1415,8 @@ struct SptrsvWrap { // FIXME CUDA: This algorithm not working with all integral type combos // In any case, this serves as a skeleton for 3-level hierarchical parallelism // for alg dev - template + template struct UpperTriLvlSchedTP2SolverFunctor { RowMapType row_map; EntriesType entries; @@ -1428,14 +1435,14 @@ struct SptrsvWrap { const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -1444,30 +1451,30 @@ struct SptrsvWrap { size_t nrows = row_map.extent(0) - 1; Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at start offset + 
lhs(rowid) = (rhs_rowid + diff) / values(soffset); + } // end if + }); // end TeamThreadRange team.team_barrier(); } @@ -1479,32 +1486,32 @@ struct SptrsvWrap { size_t nrows = row_map.extent(0) - 1; Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange team.team_barrier(); } @@ -1514,8 +1521,8 @@ struct SptrsvWrap { // Single-block functors // -------------------------------- - template + template struct LowerTriLvlSchedTP1SingleBlockFunctor { RowMapType row_map; EntriesType entries; @@ -1532,22 +1539,22 @@ struct SptrsvWrap { long cutoff; // team_size: each team can be assigned a row, if there are enough rows... 
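    // In this single-block scheme one team sweeps every level in
    // [lvl_start, lvl_end); the separate per-level kernel launches of the
    // multi-block functors are replaced by team_barrier()s between levels.
    // A rough sketch of the level loop that the operators below follow
    // (using the members declared above, not the verbatim body):
    //
    //   long mut_node_count = node_count;
    //   for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) {
    //     auto nodes_this_lvl = nodes_per_level(lvl);
    //     if (team.team_rank() < nodes_this_lvl) {
    //       auto rowid =
    //           nodes_grouped_by_level(mut_node_count + team.team_rank());
    //       // per-row substitution, as in the multi-block functors
    //     }
    //     mut_node_count += nodes_this_lvl;
    //     team.team_barrier();
    //   }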
- LowerTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} + LowerTriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + cutoff(cutoff_) {} // SingleBlock: Only one block (or league) executing; team_rank used to map // thread to row @@ -1584,23 +1591,23 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif // ASSUMPTION: sorted diagonal value located at eoffset - 1 lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); } // end if team.team_rank() < nodes_this_lvl { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -1640,25 +1647,25 @@ struct SptrsvWrap { auto diag = -1; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -1699,25 +1706,25 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * 
lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -1762,25 +1769,25 @@ struct SptrsvWrap { auto diag = -1; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -1788,8 +1795,8 @@ struct SptrsvWrap { } // end tagged operator }; - template + template struct UpperTriLvlSchedTP1SingleBlockFunctor { RowMapType row_map; EntriesType entries; @@ -1807,21 +1814,21 @@ struct SptrsvWrap { // team_size: each team can be assigned a row, if there are enough rows... 
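    // The LargerCutoffTag / UnsortedLargerCutoffTag operators below cover
    // levels that hold more rows than the team has threads: instead of one
    // row per thread, the team strides over the level. A rough sketch of
    // that inner loop (not the verbatim body):
    //
    //   for (auto my_rank = team.team_rank(); my_rank < nodes_this_lvl;
    //        my_rank += team.team_size()) {
    //     auto rowid = nodes_grouped_by_level(my_rank + mut_node_count);
    //     // per-row substitution as in the non-cutoff operators
    //   }
    //
    // The cutoff member is presumably what the launch code compares against
    // the level size when choosing between the plain and LargerCutoff tags.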
UpperTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + cutoff(cutoff_) {} // SingleBlock: Only one block (or league) executing; team_rank used to map // thread to row @@ -1857,23 +1864,23 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif // ASSUMPTION: sorted diagonal value located at soffset lhs(rowid) = (rhs_val + diff) / values(soffset); } // end if { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl each thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -1917,24 +1924,24 @@ struct SptrsvWrap { auto diag = -1; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl each thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -1976,25 +1983,25 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto 
ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri lhs(rowid) = (rhs_val + diff) / values(soffset); } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2041,25 +2048,25 @@ struct SptrsvWrap { auto trange = eoffset - soffset; auto diag = -1; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2067,8 +2074,8 @@ struct SptrsvWrap { } // end tagged operator }; - template + template struct TriLvlSchedTP1SingleBlockFunctor { RowMapType row_map; EntriesType entries; @@ -2088,24 +2095,24 @@ struct SptrsvWrap { // team_size: each team can be assigned a row, if there are enough rows... 
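    // This functor unifies the lower and upper single-block solves; per row,
    // the only difference is where the sorted diagonal lives. Conceptually
    // the sorted-tag operators below compute
    //
    //   lhs(rowid) = is_lowertri
    //                    ? (rhs_val + diff) / values(eoffset - 1)  // lower: diag last in row
    //                    : (rhs_val + diff) / values(soffset);     // upper: diag first in row
    //
    // while the unsorted-tag operators record the diagonal's position in
    // `diag` during the reduction and divide by values(diag).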
TriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, + const int dense_nrows_ = 0, const int cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + is_lowertri(is_lower_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} // SingleBlock: Only one block (or league) executing; team_rank used to map // thread to row @@ -2142,16 +2149,16 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower @@ -2162,8 +2169,8 @@ struct SptrsvWrap { lhs(rowid) = (rhs_val + diff) / values(soffset); } // end if team.team_rank() < nodes_this_lvl { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2206,24 +2213,24 @@ struct SptrsvWrap { auto trange = eoffset - soffset; auto diag = -1; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2265,20 +2272,20 @@ struct SptrsvWrap { #else auto trange = eoffset - 
soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri if (is_lowertri) lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); else @@ -2286,8 +2293,8 @@ struct SptrsvWrap { } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2334,25 +2341,25 @@ struct SptrsvWrap { auto trange = eoffset - soffset; auto diag = -1; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2360,8 +2367,8 @@ struct SptrsvWrap { } // end tagged operator }; - template + template struct TriLvlSchedTP1SingleBlockFunctorDiagValues { RowMapType row_map; EntriesType entries; @@ -2382,26 +2389,26 @@ struct SptrsvWrap { // team_size: each team can be assigned a row, if there are enough rows... 
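    // Unlike the functor above, this variant never reads the diagonal out of
    // `values`: every row divides by diagonal_values(rowid). A hypothetical
    // extraction pass that would produce such a view for a sorted lower
    // triangle (names assumed, not part of this file) could look like:
    //
    //   Kokkos::parallel_for(
    //       "extract_diag", range_policy(0, nrows),
    //       KOKKOS_LAMBDA(const lno_t i) {
    //         diagonal_values(i) = values(row_map(i + 1) - 1);  // diag stored last
    //       });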
TriLvlSchedTP1SingleBlockFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, const entries_t &nodes_per_level_, - const ValuesType &diagonal_values_, long node_count_, - const long lvl_start_, const long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - diagonal_values(diagonal_values_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, + long node_count_, const long lvl_start_, const long lvl_end_, + const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + diagonal_values(diagonal_values_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + is_lowertri(is_lower_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} // SingleBlock: Only one block (or league) executing; team_rank used to map // thread to row @@ -2438,25 +2445,25 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower // tri, soffset for upper tri lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); } // end if team.team_rank() < nodes_this_lvl { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2498,23 +2505,23 @@ struct SptrsvWrap { #else auto trange = eoffset - soffset; Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); } // end if team.team_rank() < nodes_this_lvl } // end for my_rank loop { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread + // Update mut_node_count from nodes_per_level(lvl) each 
iteration of + // lvl per thread mut_node_count += nodes_this_lvl; } team.team_barrier(); @@ -2523,13 +2530,15 @@ struct SptrsvWrap { }; #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT - template - static void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { + template + static void lower_tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); + thandle.get_sptrsvCudaGraph(); auto nlevels = thandle.get_num_levels(); @@ -2560,18 +2569,18 @@ struct SptrsvWrap { size_type lvl_nodes = hnodes_per_level(iter); auto policy = std::is_same::value - ? team_policy(lvl_nodes, team_size, cuda1) - : team_policy(lvl_nodes, team_size); + ? team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); Kokkos::parallel_for( - "parfor_l_team_cudagraph", - Kokkos::Experimental::require( - policy, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); + "parfor_l_team_cudagraph", + Kokkos::Experimental::require( + policy, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + LowerTriLvlSchedTP1SolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count)); node_count += hnodes_per_level(iter); } @@ -2579,7 +2588,8 @@ struct SptrsvWrap { cudaStreamEndCapture(stream1, &graph); // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0); + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, + NULL, 0); thandle.cudagraphCreated = true; } // Run graph @@ -2590,13 +2600,15 @@ struct SptrsvWrap { Kokkos::fence(); } // end lower_tri_solve_cg - template - static void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { + template + static void upper_tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); + thandle.get_sptrsvCudaGraph(); auto nlevels = thandle.get_num_levels(); @@ -2626,18 +2638,18 @@ struct SptrsvWrap { size_type lvl_nodes = hnodes_per_level(iter); auto policy = std::is_same::value - ? team_policy(lvl_nodes, team_size, cuda1) - : team_policy(lvl_nodes, team_size); + ? 
team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); Kokkos::parallel_for( - "parfor_u_team_cudagraph", - Kokkos::Experimental::require( - policy, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); + "parfor_u_team_cudagraph", + Kokkos::Experimental::require( + policy, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + UpperTriLvlSchedTP1SolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count)); node_count += hnodes_per_level(iter); } @@ -2645,7 +2657,8 @@ struct SptrsvWrap { cudaStreamEndCapture(stream1, &graph); // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0); + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, + NULL, 0); thandle.cudagraphCreated = true; } // Run graph @@ -2658,10 +2671,11 @@ struct SptrsvWrap { #endif - template + template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, + const RowMapType row_map, + const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) @@ -2669,8 +2683,8 @@ struct SptrsvWrap { #endif auto nlevels = thandle.get_num_levels(); // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase auto nodes_per_level = thandle.get_nodes_per_level(); auto hnodes_per_level = thandle.get_host_nodes_per_level(); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); @@ -2686,7 +2700,8 @@ struct SptrsvWrap { const scalar_t zero(0.0); const scalar_t one(1.0); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_grouped_by_level_host = + thandle.get_host_nodes_grouped_by_level(); if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || @@ -2694,8 +2709,8 @@ struct SptrsvWrap { Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); row_map_host = row_map_host_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); Kokkos::deep_copy(row_map_host, row_map); } @@ -2714,7 +2729,7 @@ struct SptrsvWrap { integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); integer_view_host_t diag_kernel_type_host = - thandle.get_diag_kernel_type_host(); + thandle.get_diag_kernel_type_host(); // workspaces work_view_int_t work_offset = thandle.get_work_offset(); @@ -2749,7 +2764,7 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - int team_size = thandle.get_team_size(); + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -2, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); 
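            // A rough reading of this block: the supernodal path stages each
            // supernode's slice of lhs in the contiguous `work` array so that
            // dense BLAS kernels can operate per block, approximately:
            //   work(workoffset .. workoffset + nscol) <- lhs(j1 .. j2)   (gather)
            //   trsm/gemv with the dense diagonal block of the supernode
            //   gemv with the off-diagonal rows, then scatter back into lhs
            // Per the surrounding comments, the leading int passed to
            // SparseTriSupernodalSpMVFunctor (-2, -1, 1, 0) appears to select
            // which gather / scatter / zero-out step the functor performs.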
Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( @@ -2932,9 +2947,9 @@ struct SptrsvWrap { if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( @@ -2945,8 +2960,8 @@ struct SptrsvWrap { } // launching sparse-triangular solve functor - LowerTriSupernodalFunctor + LowerTriSupernodalFunctor sptrsv_functor(unit_diagonal, invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, @@ -2987,9 +3002,9 @@ struct SptrsvWrap { auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( @@ -2999,9 +3014,9 @@ struct SptrsvWrap { } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( @@ -3015,9 +3030,9 @@ struct SptrsvWrap { KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace - SparseTriSupernodalSpMVFunctor - sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor( + 0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, + work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( @@ -3040,24 +3055,26 @@ struct SptrsvWrap { cudaProfilerStop(); #endif } // end if - } // end for lvl + } // end for lvl #ifdef profile_supernodal_etree Kokkos::fence(); double sptrsv_time_seconds = sptrsv_timer.seconds(); std::cout << " + Execution space : " << execution_space::name() << std::endl; - std::cout << " + Memory space : " << temp_mem_space::name() << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() + << std::endl; std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; #endif } // end lower_tri_solve - template + template static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, + const RowMapType row_map, + const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) @@ -3067,8 +3084,8 @@ struct SptrsvWrap { auto nlevels = thandle.get_num_levels(); // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // 
properly updated after the symbolic phase + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase auto nodes_per_level = thandle.get_nodes_per_level(); auto hnodes_per_level = thandle.get_host_nodes_per_level(); // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); @@ -3086,7 +3103,8 @@ struct SptrsvWrap { const scalar_t zero(0.0); const scalar_t one(1.0); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_grouped_by_level_host = + thandle.get_host_nodes_grouped_by_level(); if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || @@ -3094,8 +3112,8 @@ struct SptrsvWrap { Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); row_map_host = row_map_host_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); Kokkos::deep_copy(row_map_host, row_map); } @@ -3113,7 +3131,7 @@ struct SptrsvWrap { integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); integer_view_host_t diag_kernel_type_host = - thandle.get_diag_kernel_type_host(); + thandle.get_diag_kernel_type_host(); // workspace work_view_int_t work_offset = thandle.get_work_offset(); @@ -3123,8 +3141,8 @@ struct SptrsvWrap { size_type node_count = 0; - // This must stay serial; would be nice to try out Cuda's graph stuff to reduce - // kernel launch overhead + // This must stay serial; would be nice to try out Cuda's graph stuff to + // reduce kernel launch overhead #ifdef profile_supernodal_etree Kokkos::Timer sptrsv_timer; sptrsv_timer.reset(); @@ -3140,55 +3158,56 @@ struct SptrsvWrap { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::Experimental::require( - range_policy(space, node_count, node_count + lvl_nodes), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + "parfor_fixed_lvl", + Kokkos::Experimental::require( + range_policy(space, node_count, node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + UpperTriLvlSchedRPSolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - false, node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + false, node_count); #else UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); #endif if (team_size == -1) Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_u_team", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); else 
Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_u_team", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } - // TP2 algorithm has issues with some offset-ordinal combo to be addressed + // TP2 algorithm has issues with some offset-ordinal combo to be + // addressed /* else if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef - Kokkos::TeamPolicy tvt_policy_type; + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { +typedef Kokkos::TeamPolicy tvt_policy_type; int team_size = thandle.get_team_size(); if ( team_size == -1 ) { team_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : -64; + Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace +>::value ? 1 : 64; } int vector_size = thandle.get_team_size(); if ( vector_size == -1 ) { @@ -3230,22 +3249,23 @@ tstf); } // end elseif if (thandle.is_column_major()) { // U stored in CSC if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) + // using device-level kernels (functor is called to gather the + // input into workspace) scalar_t *dataU = const_cast(values.data()); if (invert_diagonal && !invert_offdiagonal) { // copy diagonals to workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -2, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; league_rank++) { @@ -3255,14 +3275,14 @@ tstf); } // end elseif int j1 = supercols_host[s]; int j2 = supercols_host[s + 1]; int nscol = - j2 - j1; // number of columns in the s-th supernode column + j2 - j1; // number of columns in the s-th supernode column int i1 = row_map_host(j1); int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; // "total" number of rows in all the - // supernodes (diagonal+off-diagonal) - int nsrow2 = nsrow - nscol; // "total" number of rows in all the - // off-diagonal supernodes + int nsrow = i2 - i1; // "total" number of rows in all the + // supernodes (diagonal+off-diagonal) + int nsrow2 = nsrow - nscol; // "total" number of rows in all + // the off-diagonal supernodes #ifdef profile_supernodal_etree flops += 2 * (nscol * nsrow); #endif @@ -3270,22 +3290,22 @@ tstf); } // end elseif // workspace int workoffset = work_offset_host(s); - // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft + // create a view for the s-th supernocal block column + // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View - viewU(&dataU[i1], nsrow, nscol); + viewU(&dataU[i1], 
nsrow, nscol); if (invert_offdiagonal) { - auto Uij = - Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Z = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nsrow)); // needed with gemv for update&scatter + auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), + Kokkos::ALL()); + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + auto Z = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + + nsrow)); // needed with gemv for update&scatter KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } else { // extract part of the solution, corresponding to the diagonal @@ -3294,22 +3314,21 @@ tstf); } // end elseif // "triangular-solve" to compute Xj // extract the diagonal block of s-th supernocal column of U - auto Ujj = - Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), + Kokkos::ALL()); if (invert_diagonal) { auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nscol)); // needed for gemv instead of trmv/trsv + work, range_type( + workoffset, + workoffset + nscol)); // needed for gemv + // instead of trmv/trsv KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = // LayoutLeft Kokkos::View - Xjj(Xj.data(), nscol, 1); + Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks @@ -3319,11 +3338,10 @@ tstf); } // end elseif auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); auto Z = Kokkos::subview( - work, - range_type( - workoffset + nscol, - workoffset + nscol + - nsrow2)); // needed with gemv for update&scatter + work, range_type(workoffset + nscol, + workoffset + nscol + + nsrow2)); // needed with gemv for + // update&scatter KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } } @@ -3331,51 +3349,52 @@ tstf); } // end elseif if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); } } // launching sparse-triangular solve functor - UpperTriTranSupernodalFunctor - sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, - row_map, entries, values, lvl, kernel_type, - diag_kernel_type, lhs, work, work_offset, - nodes_grouped_by_level, node_count); + UpperTriTranSupernodalFunctor + sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, + row_map, entries, values, lvl, kernel_type, + diag_kernel_type, lhs, work, work_offset, + nodes_grouped_by_level, node_count); Kokkos::parallel_for( - "parfor_usolve_tran_supernode", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_functor); + 
"parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor - UpperTriSupernodalFunctor - sptrsv_functor(invert_diagonal, supercols, row_map, entries, - values, lvl, kernel_type, diag_kernel_type, lhs, - work, work_offset, nodes_grouped_by_level, - node_count); + UpperTriSupernodalFunctor + sptrsv_functor(invert_diagonal, supercols, row_map, entries, + values, lvl, kernel_type, diag_kernel_type, lhs, + work, work_offset, nodes_grouped_by_level, + node_count); Kokkos::parallel_for( - "parfor_usolve_supernode", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_functor); + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) + // using device-level kernels (functor is called to gather the + // input into workspace) scalar_t *dataU = const_cast(values.data()); for (size_type league_rank = 0; league_rank < lvl_nodes; @@ -3386,7 +3405,7 @@ tstf); } // end elseif int j1 = supercols_host[s]; int j2 = supercols_host[s + 1]; int nscol = - j2 - j1; // number of columns in the s-th supernode column + j2 - j1; // number of columns in the s-th supernode column // "total" number of rows in all the supernodes // (diagonal+off-diagonal) @@ -3403,59 +3422,62 @@ tstf); } // end elseif // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View - viewU(&dataU[i1], nsrow, nscol); + viewU(&dataU[i1], nsrow, nscol); // extract part of the solution, corresponding to the diagonal // block auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nscol)); // needed for gemv instead of trmv/trsv + work, + range_type( + workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv // update with off-diagonal blocks if (nsrow2 > 0) { - // extract the off-diagonal blocks of s-th supernodal column of + // extract the off-diagonal blocks of s-th supernodal column + // of // U auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); auto Z = Kokkos::subview( - work, - range_type( - workoffset + nscol, - workoffset + nscol + - nsrow2)); // needed with gemv for update&scatter + work, + range_type( + workoffset + nscol, + workoffset + nscol + + nsrow2)); // needed with gemv for update&scatter KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } // "triangular-solve" to compute Xj // extract the diagonal block of s-th supernocal column of U auto Ujj = - Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only default_layout = + // LayoutLeft Kokkos::View - Xjj(Xj.data(), nscol, 1); + Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } if (invert_diagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - 
SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); } } } @@ -3467,9 +3489,10 @@ tstf); } // end elseif << " kernel-type: " << kernel_type_host(lvl) << " # of supernodes: " << lvl_nodes << std::endl; #endif - } else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV || + } else if (thandle.get_algorithm() == + SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == - SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -3479,8 +3502,8 @@ tstf); } // end elseif // update with one, or two, spmv bool transpose_spmv = - ((!thandle.transpose_spmv() && thandle.is_column_major()) || - (thandle.transpose_spmv() && !thandle.is_column_major())); + ((!thandle.transpose_spmv() && thandle.is_column_major()) || + (thandle.transpose_spmv() && !thandle.is_column_major())); const char *tran = (transpose_spmv ? "T" : "N"); if (!transpose_spmv) { // U stored in CSR if (!invert_offdiagonal) { @@ -3488,27 +3511,27 @@ tstf); } // end elseif auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + // zero out lhs corresponding to diagonal blocks in lhs, and copy + // to work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update with off-diagonals (potentiall combined with diagonal // solves) @@ -3516,17 +3539,17 @@ tstf); } // end elseif KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } 
else { if (!invert_offdiagonal) { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + // zero out lhs corresponding to diagonal blocks in lhs, and copy + // to work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); // update with off-diagonals auto submat = thandle.get_submatrix(lvl); @@ -3541,15 +3564,15 @@ tstf); } // end elseif } } // reinitialize workspace - SparseTriSupernodalSpMVFunctor - sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor( + 0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, + work); Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_finalize_functor); + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3574,17 +3597,19 @@ tstf); } // end elseif << std::endl; std::cout << " + Execution space : " << execution_space::name() << std::endl; - std::cout << " + Memory space : " << temp_mem_space::name() << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() + << std::endl; #endif } // end upper_tri_solve - template -static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, LHSType &lhs, - const bool /*is_lowertri_*/) { + template + static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif @@ -3593,8 +3618,8 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, size_type num_chain_entries = thandle.get_num_chain_entries(); // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase auto nodes_per_level = thandle.get_nodes_per_level(); auto hnodes_per_level = thandle.get_host_nodes_per_level(); @@ -3604,28 +3629,28 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, size_type node_count = 0; - // REFACTORED to cleanup; next, need debug and timer routines + // REFACTORED to cleanup; next, need debug and timer routines using large_cutoff_policy_type = - 
Kokkos::TeamPolicy; - /* - using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = - LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = - UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = - LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = - UpperTriLvlSchedTP1SingleBlockFunctor; - */ + Kokkos::TeamPolicy; + /* + using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = + LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = + UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = + LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = + UpperTriLvlSchedTP1SingleBlockFunctor; + */ using SingleBlockFunctor = - TriLvlSchedTP1SingleBlockFunctor; + TriLvlSchedTP1SingleBlockFunctor; int team_size = thandle.get_team_size(); int vector_size = - thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; + thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; auto cutoff = thandle.get_chain_threshold(); int team_size_singleblock = team_size; @@ -3636,19 +3661,20 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, // ts > 0 | cu 0 - set // ts > 0 | cu > 0 - set // Controls ts,cu > 0 - // co > ts - not all rows can be mapped to a thread - must call largercutoff - // impl co <= ts - okay, kernel must be careful not to access out-of-bounds; - // some threads idol + // co > ts - not all rows can be mapped to a thread - must call + // largercutoff impl co <= ts - okay, kernel must be careful not to access + // out-of-bounds; some threads idol if (team_size_singleblock <= 0 && cutoff == 0) { team_size_singleblock = 1; - // If cutoff == 0, no single-block calls will be made, team_size_singleblock - // is unimportant + // If cutoff == 0, no single-block calls will be made, + // team_size_singleblock is unimportant } // This is only necessary for Lower,UpperTri functor versions; else, // is_lowertri can be passed as arg to the generic Tri functor... if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { size_type schain = h_chain_ptr(chainlink); size_type echain = h_chain_ptr(chainlink + 1); @@ -3657,27 +3683,27 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + true, node_count); #else LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); #endif if (team_size == -1) { team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
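        // Roughly, h_chain_ptr groups consecutive levels into "chains": a chain
        // of length one falls through to the ordinary one-team-per-row TP1
        // launch below, while a longer chain is swept by a single team
        // (single-block functor) so the whole group costs one kernel launch;
        // a block-striding variant is used when the chain threshold (cutoff)
        // exceeds team_size_singleblock.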
Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_l_team_chain1", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3689,55 +3715,55 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, if (team_size_singleblock <= 0) { team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), + Kokkos::ParallelForTag()); } if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, true); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + LowerTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain); #endif Kokkos::parallel_for( - "parfor_l_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_l_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true, 0, - cutoff); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, true, 0, + cutoff); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); + LowerTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, cutoff); #endif Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + 
tstf); } node_count += lvl_nodes; } @@ -3748,7 +3774,8 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, } } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { size_type schain = h_chain_ptr(chainlink); size_type echain = h_chain_ptr(chainlink + 1); @@ -3757,29 +3784,29 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - is_lowertri, node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + is_lowertri, node_count); #else UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); #endif if (team_size == -1) { team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); } // TODO To use cudagraph here, need to know how many non-unit chains // there are, create a graph for each and launch accordingly size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? Kokkos::parallel_for( - "parfor_u_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_u_team_chain1", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3792,58 +3819,58 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, if (team_size_singleblock <= 0) { // team_size_singleblock = team_policy(1, 1, // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), - // Kokkos::ParallelForTag()); + // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, + // node_count), Kokkos::ParallelForTag()); team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), + Kokkos::ParallelForTag()); } if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, is_lowertri); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + UpperTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, 
echain); #endif Kokkos::parallel_for( - "parfor_u_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_u_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri, 0, - cutoff); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, is_lowertri, + 0, cutoff); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); + UpperTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, cutoff); #endif Kokkos::parallel_for( - "parfor_u_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_u_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } node_count += lvl_nodes; } @@ -3858,18 +3885,18 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, // -------------------------------- // Stream interfaces // -------------------------------- - template - static void lower_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { + template + static void lower_tri_solve_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; + typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; // Create vectors for handles' data in streams @@ -3900,56 +3927,57 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( - "parfor_fixed_lvl", - range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - LowerTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], - nodes_grouped_by_level_v[i])); + "parfor_fixed_lvl", + range_policy(execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + LowerTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == 
KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - int team_size = thandle_v[i]->get_team_size(); + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], true, - node_count_v[i]); + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], true, + node_count_v[i]); #else - LowerTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType> - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + LowerTriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif if (team_size == -1) Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); else Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams } // end for lvl - } // end lower_tri_solve_streams - - template - static void upper_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { + } // end lower_tri_solve_streams + + template + static void upper_tri_solve_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; + typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; // Create vectors for handles' data in streams @@ -3980,45 +4008,46 @@ static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( - "parfor_fixed_lvl", - range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - UpperTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], - nodes_grouped_by_level_v[i])); + "parfor_fixed_lvl", + range_policy(execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + UpperTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - int team_size = thandle_v[i]->get_team_size(); + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], false, - node_count_v[i]); + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], false, + 
node_count_v[i]); #else - UpperTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType> - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + UpperTriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif if (team_size == -1) Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); else Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); - } + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams } // end for lvl - } // end upper_tri_solve_streams + } // end upper_tri_solve_streams -}; // struct SptrsvWrap +}; // struct SptrsvWrap } // namespace Experimental } // namespace Impl diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index ecdbdd3d73..d69c499c60 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -120,7 +120,8 @@ struct SPTRSV_SOLVE; + using Sptrsv = + Experimental::SptrsvWrap; // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); @@ -129,19 +130,20 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, true); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif Sptrsv::lower_tri_solve(space, *sptrsv_handle, row_map, entries, @@ -149,19 +151,20 @@ struct SPTRSV_SOLVEis_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, false); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, @@ -178,7 +181,8 @@ struct SPTRSV_SOLVE &entries_v, const std::vector &values_v, const 
std::vector &b_v, std::vector &x_v) { - using Sptrsv = Experimental::SptrsvWrap; + using Sptrsv = + Experimental::SptrsvWrap; // Call specific algorithm type // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment // Assume streams have the same either lower or upper matrix type @@ -198,9 +202,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -209,9 +212,8 @@ struct SPTRSV_SOLVEdestroy_sptrsv_handle(); this->is_owner_of_the_sptrsv_handle = true; - this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri, block_size); + this->sptrsvHandle = + new SPTRSVHandleType(algm, nrows, lower_tri, block_size); // this->sptrsvHandle->init_handle(nrows); this->sptrsvHandle->set_team_size(this->team_work_size); this->sptrsvHandle->set_vector_size(this->vector_size); diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index caa07ab07d..fb322b7f95 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -76,45 +76,59 @@ class SPTRSVHandle { using const_nnz_scalar_t = const scalar_t; // Row_map type (managed memory) - using nnz_row_view_temp_t = typename Kokkos::View; - using nnz_row_view_t = typename Kokkos::View; + using nnz_row_view_temp_t = + typename Kokkos::View; + using nnz_row_view_t = + typename Kokkos::View; using host_nnz_row_view_t = typename nnz_row_view_t::HostMirror; - using int_row_view_t = typename Kokkos::View; - using int64_row_view_t = typename Kokkos::View; + using int_row_view_t = + typename Kokkos::View; + using int64_row_view_t = + typename Kokkos::View; // typedef typename row_lno_persistent_work_view_t::HostMirror // row_lno_persistent_work_host_view_t; //Host view type using nnz_row_unmanaged_view_t = typename Kokkos::View< const size_type *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits>; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // values type (managed memory) - using nnz_scalar_view_temp_t = typename Kokkos::View; - using nnz_scalar_view_t = typename Kokkos::View; + using nnz_scalar_view_temp_t = + typename Kokkos::View; + using nnz_scalar_view_t = + typename Kokkos::View; using host_nnz_scalar_view_t = typename nnz_scalar_view_t::HostMirror; using nnz_scalar_unmanaged_view_t = typename Kokkos::View< const scalar_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits>; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // entries type (managed memory) - using nnz_lno_view_temp_t = typename Kokkos::View; - using nnz_lno_view_t = typename Kokkos::View; - using hostspace_nnz_lno_view_t = typename Kokkos::View; + using nnz_lno_view_temp_t = + typename Kokkos::View; + using nnz_lno_view_t = + typename Kokkos::View; + using hostspace_nnz_lno_view_t = + typename Kokkos::View; using host_nnz_lno_view_t = typename nnz_lno_view_t::HostMirror; using nnz_lno_unmanaged_view_t = typename Kokkos::View< const nnz_lno_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits>; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // typedef typename nnz_lno_persistent_work_view_t::HostMirror // nnz_lno_persistent_work_host_view_t; //Host view type - using signed_integral_t = typename std::make_signed::type; - using signed_nnz_lno_view_t = Kokkos::View; + using signed_integral_t = typename std::make_signed< + typename nnz_row_view_t::non_const_value_type>::type; + using signed_nnz_lno_view_t = + Kokkos::View; using host_signed_nnz_lno_view_t = 
typename signed_nnz_lno_view_t::HostMirror; - using mtx_scalar_view_t = typename Kokkos::View; + using mtx_scalar_view_t = + typename Kokkos::View; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) @@ -285,7 +299,7 @@ class SPTRSVHandle { nnz_lno_view_t nodes_grouped_by_level; hostspace_nnz_lno_view_t hnodes_grouped_by_level; // NEW size_type nlevel; - size_type block_size; // block_size > 0 implies BSR + size_type block_size; // block_size > 0 implies BSR int team_size; int vector_size; @@ -413,8 +427,8 @@ class SPTRSVHandle { public: SPTRSVHandle(SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, - const size_type block_size_ = 0, - bool symbolic_complete_ = false, bool numeric_complete_ = false) + const size_type block_size_ = 0, bool symbolic_complete_ = false, + bool numeric_complete_ = false) : #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT cudagraphCreated(false), From 007b1fe569d1ebf7c035d79095728b49fc2d42f1 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 25 Jun 2024 16:49:38 -0600 Subject: [PATCH 05/41] Cleanup progress --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 446 ++++-------------- sparse/src/KokkosSparse_sptrsv_handle.hpp | 2 + 2 files changed, 96 insertions(+), 352 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index d385a390cd..36e2e850e3 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -103,7 +103,7 @@ struct SptrsvWrap { // This functor unifies the lower and upper implementations, the hope is the // "is_lowertri" check does not add noticable time on larger problems template + class LHSType, class RHSType, bool IsLower> struct TriLvlSchedTP1SolverFunctor { RowMapType row_map; EntriesType entries; @@ -112,8 +112,6 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - const bool is_lowertri; - long node_count; // like "block" offset into ngbl, my_league is the "local" // offset @@ -122,7 +120,6 @@ struct SptrsvWrap { const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const bool &is_lowertri_, const long &node_count_) : row_map(row_map_), entries(entries_), @@ -130,7 +127,6 @@ struct SptrsvWrap { lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), node_count(node_count_) {} KOKKOS_INLINE_FUNCTION @@ -148,8 +144,7 @@ struct SptrsvWrap { Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); - - auto val = values(ptr); + auto val = values(ptr); if (colid != rowid) { tdiff = tdiff - val * lhs(colid); } @@ -162,8 +157,8 @@ struct SptrsvWrap { // only one thread should do this; can also use Kokkos::single if (my_rank == 0) { // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); + lhs(rowid) = IsLower ? 
(rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); } } @@ -382,8 +377,8 @@ struct SptrsvWrap { // Lower vs Upper Multi-block Functors template - struct LowerTriLvlSchedRPSolverFunctor { + class LHSType, class RHSType, bool IsLower> + struct TriLvlSchedRPSolverFunctor { RowMapType row_map; EntriesType entries; ValuesType values; @@ -391,11 +386,11 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) + TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) : row_map(row_map_), entries(entries_), values(values_), @@ -408,12 +403,11 @@ struct SptrsvWrap { auto rowid = nodes_grouped_by_level(i); // Assuming indices are sorted per row, diag entry is final index in the // list - long soffset = row_map(rowid); long eoffset = row_map(rowid + 1); auto rhs_rowid = rhs(rowid); - - for (long ptr = soffset; ptr < eoffset; ++ptr) { + for (long ptr = IsLower ? soffset : eoffset - 1; + (IsLower && (ptr < eoffset)) || (!IsLower && (ptr >= soffset)); ptr+=(IsLower ? 1 : -1)) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -431,8 +425,8 @@ struct SptrsvWrap { long eoffset = row_map(rowid + 1); auto rhs_rowid = rhs(rowid); auto diag = -1; - - for (long ptr = soffset; ptr < eoffset; ++ptr) { + for (long ptr = IsLower ? soffset : eoffset - 1; + (IsLower && (ptr < eoffset)) || (!IsLower && (ptr >= soffset)); ptr+=(IsLower ? 1 : -1)) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -445,102 +439,6 @@ struct SptrsvWrap { } }; - template - struct LowerTriLvlSchedTP1SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } - } - - KOKKOS_INLINE_FUNCTION - 
void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } - }; - // FIXME CUDA: This algorithm not working with all integral type combos // In any case, this serves as a skeleton for 3-level hierarchical parallelism // for alg dev @@ -1316,102 +1214,6 @@ struct SptrsvWrap { } }; - template - struct UpperTriLvlSchedTP1SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if 
(my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } - }; - // FIXME CUDA: This algorithm not working with all integral type combos // In any case, this serves as a skeleton for 3-level hierarchical parallelism // for alg dev @@ -2529,6 +2331,10 @@ struct SptrsvWrap { } // end tagged operator }; + // + // End of functors, begin external API + // + #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT template @@ -2577,8 +2383,8 @@ struct SptrsvWrap { Kokkos::Experimental::require( policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedTP1SolverFunctor( + TriLvlSchedTP1SolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); @@ -2646,8 +2452,8 @@ struct SptrsvWrap { Kokkos::Experimental::require( policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedTP1SolverFunctor( + TriLvlSchedTP1SolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); @@ -2671,6 +2477,9 @@ struct SptrsvWrap { #endif +#define FunctorTypeMacro(Functor, IsLower) \ + Functor + template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, @@ -2681,13 +2490,23 @@ struct SptrsvWrap { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - auto nlevels = thandle.get_num_levels(); + const auto nlevels = thandle.get_num_levels(); // Keep this a host View, create device version and copy to back to host // during scheduling This requires making sure the host view in the handle // is properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + const auto nodes_per_level = thandle.get_nodes_per_level(); + const auto hnodes_per_level = thandle.get_host_nodes_per_level(); + const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + //const auto block_size = thandle.get_block_size(); + const auto block_enabled = thandle.is_block_enabled(); + + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported"); + + // Set up functor types + using LowerRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true); + //using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, true); + using LowerTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true); + //using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, true); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2744,6 +2563,12 @@ struct SptrsvWrap { sptrsv_timer.reset(); #endif + // Create basic functors + LowerRPPoint lrpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); + //LowerRPBlock lrpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); + LowerTPPoint ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); + //LowerTPBlock ltpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); + for (size_type lvl = 0; lvl < nlevels; ++lvl) { size_type lvl_nodes = hnodes_per_level(lvl); @@ -2756,41 +2581,20 @@ struct SptrsvWrap { Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( - range_policy(space, node_count, node_count + lvl_nodes), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + 
range_policy(space, node_count, node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + lrpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle.get_team_size(); - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + Kokkos::parallel_for( + "parfor_l_team", + Kokkos::Experimental::require( + tp, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + ltpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -3088,10 +2892,17 @@ struct SptrsvWrap { // is properly updated after the symbolic phase auto nodes_per_level = thandle.get_nodes_per_level(); auto hnodes_per_level = thandle.get_host_nodes_per_level(); - // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); - // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + //const auto block_size = thandle.get_block_size(); + const auto block_enabled = thandle.is_block_enabled(); + + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported"); + + // Set up functor types + using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false); + //using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, true); + using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false); + //using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, true); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -3141,6 +2952,12 @@ struct SptrsvWrap { size_type node_count = 0; + // Create basic functors + UpperRPPoint urpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); + //UpperRPBlock urpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); + UpperTPPoint utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); + //UpperTPBlock utpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); + // This must stay serial; would be nice to try out Cuda's graph stuff to // reduce kernel launch overhead #ifdef profile_supernodal_etree @@ -3162,39 +2979,18 @@ struct SptrsvWrap { Kokkos::Experimental::require( range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + urpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle.get_team_size(); - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - 
false, node_count); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + tp, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + utpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -3632,18 +3428,6 @@ tstf); } // end elseif // REFACTORED to cleanup; next, need debug and timer routines using large_cutoff_policy_type = Kokkos::TeamPolicy; - /* - using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = - LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = - UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = - LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = - UpperTriLvlSchedTP1SingleBlockFunctor; - */ using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor; @@ -3680,17 +3464,10 @@ tstf); } // end elseif if (echain - schain == 1) { // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor + LHSType, RHSType, true> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); -#endif if (team_size == -1) { team_size = team_policy(space, 1, 1, vector_size) @@ -3781,17 +3558,10 @@ tstf); } // end elseif if (echain - schain == 1) { // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - is_lowertri, node_count); -#else - UpperTriLvlSchedTP1SolverFunctor + LHSType, RHSType, false> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); -#endif if (team_size == -1) { team_size = team_policy(space, 1, 1, vector_size) @@ -3898,6 +3668,8 @@ tstf); } // end elseif using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + using LowerRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true); + using LowerTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); @@ -3930,34 +3702,18 @@ tstf); } // end elseif "parfor_fixed_lvl", range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - LowerTriLvlSchedRPSolverFunctor( + LowerRPPoint( row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle_v[i]->get_team_size(); -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], true, - 
node_count_v[i]); -#else - LowerTriLvlSchedTP1SolverFunctor + auto tp = team_size == -1 ? team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) : team_policy(execspace_v[i], lvl_nodes, team_size); + LowerTPPoint tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + Kokkos::parallel_for("parfor_l_team", tp, tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) @@ -3979,6 +3735,8 @@ tstf); } // end elseif using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true); + using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); @@ -4011,34 +3769,18 @@ tstf); } // end elseif "parfor_fixed_lvl", range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - UpperTriLvlSchedRPSolverFunctor( + UpperRPPoint( row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle_v[i]->get_team_size(); -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], false, - node_count_v[i]); -#else - UpperTriLvlSchedTP1SolverFunctor + auto tp = team_size == -1 ? 
team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) : team_policy(execspace_v[i], lvl_nodes, team_size); + UpperTPPoint tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + Kokkos::parallel_for("parfor_l_team", tp, tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index fb322b7f95..53e9926f62 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -1021,6 +1021,8 @@ class SPTRSVHandle { this->block_size = block_size_; } + bool is_block_enabled() const { return block_size > 0; } + void set_symbolic_complete() { this->symbolic_complete = true; } void set_symbolic_incomplete() { this->symbolic_complete = false; } From ec90e8bf3dc81d108324af8527596cc728eb0a3d Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 26 Jun 2024 13:22:01 -0600 Subject: [PATCH 06/41] Fixes --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 36e2e850e3..8a465de2cc 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2563,12 +2563,6 @@ struct SptrsvWrap { sptrsv_timer.reset(); #endif - // Create basic functors - LowerRPPoint lrpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); - //LowerRPBlock lrpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); - LowerTPPoint ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); - //LowerTPBlock ltpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); - for (size_type lvl = 0; lvl < nlevels; ++lvl) { size_type lvl_nodes = hnodes_per_level(lvl); @@ -2578,6 +2572,8 @@ struct SptrsvWrap { #endif if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + LowerRPPoint lrpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); + Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( @@ -2587,6 +2583,7 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { + LowerTPPoint ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? 
team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); Kokkos::parallel_for( @@ -2952,12 +2949,6 @@ struct SptrsvWrap { size_type node_count = 0; - // Create basic functors - UpperRPPoint urpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); - //UpperRPBlock urpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); - UpperTPPoint utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); - //UpperTPBlock utpb(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); - // This must stay serial; would be nice to try out Cuda's graph stuff to // reduce kernel launch overhead #ifdef profile_supernodal_etree @@ -2974,6 +2965,7 @@ struct SptrsvWrap { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + UpperRPPoint urpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( @@ -2983,6 +2975,7 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { + UpperTPPoint utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); Kokkos::parallel_for( @@ -3735,8 +3728,8 @@ tstf); } // end elseif using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true); - using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true); + using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false); + using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); From 9766f4cc34c9e29a6b6ef77ee0daf74430067654 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 26 Jun 2024 13:53:49 -0600 Subject: [PATCH 07/41] Remove Upper/Lower TriLvlSchedTP2SolverFunctors --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 219 +----------------- 1 file changed, 3 insertions(+), 216 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 8a465de2cc..86270aada9 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -266,7 +266,7 @@ struct SptrsvWrap { }; template + class LHSType, class RHSType, bool IsLower> struct TriLvlSchedTP2SolverFunctor { RowMapType row_map; EntriesType entries; @@ -275,7 +275,6 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - const bool is_lowertri; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset long node_groups; @@ -286,7 +285,7 @@ struct SptrsvWrap { const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, + long node_count_, long node_groups_ = 0, long dense_nrows_ = 0) : row_map(row_map_), entries(entries_), @@ -294,7 +293,6 @@ struct SptrsvWrap { lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), node_count(node_count_), node_groups(node_groups_), dense_nrows(dense_nrows_) {} @@ -327,7 +325,7 @@ struct SptrsvWrap { diff); // 
ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri + lhs(rowid) = IsLower ? (rhs_rowid + diff) / values(eoffset - 1) : (rhs_rowid + diff) / values(soffset); } // end if @@ -439,112 +437,6 @@ struct SptrsvWrap { } }; - // FIXME CUDA: This algorithm not working with all integral type combos - // In any case, this serves as a skeleton for 3-level hierarchical parallelism - // for alg dev - template - struct LowerTriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - }; - #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // ----------------------------------------------------------- // Helper functors for Lower-triangular solve with SpMV @@ -1214,111 +1106,6 @@ struct SptrsvWrap { } }; - // FIXME CUDA: This algorithm not working with all integral type combos - // In any case, this serves as a skeleton for 3-level hierarchical parallelism - // for 
alg dev - template - struct UpperTriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - }; - // -------------------------------- // Single-block functors // -------------------------------- From f8d0f2122f6a6400ec94a461ee6db8052bd0394a Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 26 Jun 2024 14:12:58 -0600 Subject: [PATCH 08/41] Remove Upper/Lower single block functors --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 698 +----------------- 1 file changed, 26 insertions(+), 672 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 86270aada9..0736524569 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -1045,626 +1045,12 @@ struct SptrsvWrap { }; #endif - template - struct UpperTriLvlSchedRPSolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - 
RHSType rhs; - entries_t nodes_grouped_by_level; - - UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } - }; - - // -------------------------------- - // Single-block functors - // -------------------------------- - - template - struct LowerTriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
- - LowerTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - - auto colid = entries(ptr); - auto val = 
values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct UpperTriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - UpperTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = 
values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at soffset - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; + // -------------------------------- + // Single-block functors + // -------------------------------- template + class LHSType, class RHSType, bool IsLower> struct TriLvlSchedTP1SingleBlockFunctor { RowMapType row_map; EntriesType entries; @@ -1678,7 +1064,6 @@ struct SptrsvWrap { // offset long lvl_start; long lvl_end; - const bool is_lowertri; const int dense_nrows; const int cutoff; // team_size: each team can be assigned a row, if there are enough rows... @@ -1687,7 +1072,7 @@ struct SptrsvWrap { const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, + long node_count_, long lvl_start_, long lvl_end_, const int dense_nrows_ = 0, const int cutoff_ = 0) : row_map(row_map_), entries(entries_), @@ -1699,7 +1084,6 @@ struct SptrsvWrap { node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), - is_lowertri(is_lower_), dense_nrows(dense_nrows_), cutoff(cutoff_) {} @@ -1752,7 +1136,7 @@ struct SptrsvWrap { // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower // tri, soffset for upper tri - if (is_lowertri) + if (IsLower) lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); else lhs(rowid) = (rhs_val + diff) / values(soffset); @@ -1875,7 +1259,7 @@ struct SptrsvWrap { // ASSUMPTION: sorted diagonal value located at eoffset - 1 for // lower tri, soffset for upper tri - if (is_lowertri) + if (IsLower) lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); else lhs(rowid) = (rhs_val + diff) / values(soffset); @@ -3208,9 +2592,12 @@ tstf); } // end elseif // REFACTORED to cleanup; next, need debug and timer routines using large_cutoff_policy_type = Kokkos::TeamPolicy; - using SingleBlockFunctor = + using SingleBlockFunctorLower = + TriLvlSchedTP1SingleBlockFunctor; + using SingleBlockFunctorUpper = TriLvlSchedTP1SingleBlockFunctor; + LHSType, RHSType, false>; int team_size = thandle.get_team_size(); int vector_size = @@ -3274,25 +2661,17 @@ tstf); } // end elseif team_size_singleblock = team_policy(space, 1, 1, vector_size) .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - 
nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), + SingleBlockFunctorLower(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), Kokkos::ParallelForTag()); } if (cutoff <= team_size_singleblock) { -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true); -#else - LowerTriLvlSchedTP1SingleBlockFunctor + SingleBlockFunctorLower tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain); -#endif Kokkos::parallel_for( "parfor_l_team_chainmulti", Kokkos::Experimental::require( @@ -3302,18 +2681,10 @@ tstf); } // end elseif } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + SingleBlockFunctorLower tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true, 0, + nodes_per_level, node_count, schain, echain, 0, cutoff); -#else - LowerTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); -#endif Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", Kokkos::Experimental::require( @@ -3374,25 +2745,17 @@ tstf); } // end elseif team_size_singleblock = team_policy(space, 1, 1, vector_size) .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), + SingleBlockFunctorUpper(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain), Kokkos::ParallelForTag()); } if (cutoff <= team_size_singleblock) { -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri); -#else - UpperTriLvlSchedTP1SingleBlockFunctor + SingleBlockFunctorUpper tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain); -#endif Kokkos::parallel_for( "parfor_u_team_chainmulti", Kokkos::Experimental::require( @@ -3402,18 +2765,9 @@ tstf); } // end elseif } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri, - 0, cutoff); -#else - UpperTriLvlSchedTP1SingleBlockFunctor + SingleBlockFunctorUpper tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); -#endif + nodes_per_level, node_count, schain, echain, 0, cutoff); Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", Kokkos::Experimental::require( From de13ba2411e71dae71f0b2c281ee6fd480ad9a1b Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 26 Jun 2024 14:23:36 -0600 Subject: [PATCH 09/41] Remove unused TriLvlSchedTP1SingleBlockFunctorDiagValues and merge upper/lower tri_solve_cg --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 248 +----------------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 6 +- 2 files changed, 11 insertions(+), 243 
deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 0736524569..c520766ebf 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -1340,180 +1340,18 @@ struct SptrsvWrap { } // end tagged operator }; - template - struct TriLvlSchedTP1SingleBlockFunctorDiagValues { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - ValuesType diagonal_values; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, - long node_count_, const long lvl_start_, const long lvl_end_, - const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - diagonal_values(diagonal_values_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long 
mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - // // End of functors, begin external API // #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT - template - static void lower_tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + static void tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = thandle.get_sptrsvCudaGraph(); @@ -1527,7 +1365,6 @@ struct SptrsvWrap { EmptyFunctor()); Kokkos::Cuda().fence(); cudaStreamSynchronize(stream1); - // Kokkos::fence(); auto hnodes_per_level = thandle.get_host_nodes_per_level(); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); @@ -1555,76 +1392,7 @@ struct SptrsvWrap { policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), TriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); - - node_count += hnodes_per_level(iter); - } - } - cudaStreamEndCapture(stream1, &graph); - - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, - NULL, 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - - cudaStreamSynchronize(stream1); - Kokkos::fence(); - } // end lower_tri_solve_cg - - template - static void upper_tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); - - auto nlevels = 
thandle.get_num_levels(); - - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; - - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - - size_type node_count = 0; - - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; - - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - auto policy = std::is_same::value - ? team_policy(lvl_nodes, team_size, cuda1) - : team_policy(lvl_nodes, team_size); - - Kokkos::parallel_for( - "parfor_u_team_cudagraph", - Kokkos::Experimental::require( - policy, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - TriLvlSchedTP1SolverFunctor( + ValuesType, LHSType, RHSType, IsLower>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); @@ -1644,7 +1412,7 @@ struct SptrsvWrap { cudaStreamSynchronize(stream1); Kokkos::fence(); - } // end upper_tri_solve_cg + } // end tri_solve_cg #endif diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index d69c499c60..9537e86d5a 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -142,7 +142,7 @@ struct SPTRSV_SOLVE::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + Sptrsv::tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif @@ -163,8 +163,8 @@ struct SPTRSV_SOLVE::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, - b, x); + Sptrsv::tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, From 88d36d678c0075099e1e45dd46da07238d584af5 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 26 Jun 2024 14:38:09 -0600 Subject: [PATCH 10/41] Merge two big upper/lower branch of tri_solve_chain --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 230 ++++++------------ .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 8 +- 2 files changed, 73 insertions(+), 165 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index c520766ebf..24c04a3798 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2331,13 +2331,13 @@ tstf); } // end elseif } // end upper_tri_solve - template static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, - LHSType &lhs, const bool /*is_lowertri_*/) { + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif @@ -2353,19 +2353,14 @@ tstf); } // end elseif auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - const bool is_lowertri = thandle.is_lower_tri(); - size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines using large_cutoff_policy_type = 
Kokkos::TeamPolicy; - using SingleBlockFunctorLower = + using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor; - using SingleBlockFunctorUpper = - TriLvlSchedTP1SingleBlockFunctor; + LHSType, RHSType, IsLower>; int team_size = thandle.get_team_size(); int vector_size = @@ -2389,168 +2384,81 @@ tstf); } // end elseif // team_size_singleblock is unimportant } - // This is only necessary for Lower,UpperTri functor versions; else, - // is_lowertri can be passed as arg to the generic Tri functor... - if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); - if (team_size == -1) { - team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); + + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos + TriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
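+            // Chain of length 1: launch this level on its own, one team per node,
+            // via the generic TriLvlSchedTP1SolverFunctor (IsLower is a compile-time
+            // template parameter here rather than the old runtime is_lowertri branch).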
+ Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + node_count += lvl_nodes; - if (team_size_singleblock <= 0) { - team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctorLower(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + } else { + size_type lvl_nodes = 0; - if (cutoff <= team_size_singleblock) { - SingleBlockFunctorLower - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); - Kokkos::parallel_for( - "parfor_l_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally - SingleBlockFunctorLower - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, 0, - cutoff); - Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? - } - } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); - if (team_size == -1) { - team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } + if (team_size_singleblock <= 0) { + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain), + Kokkos::ParallelForTag()); + } - // TODO To use cudagraph here, need to know how many non-unit chains - // there are, create a graph for each and launch accordingly - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
+ if (cutoff <= team_size_singleblock) { + SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain); Kokkos::parallel_for( - "parfor_u_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - + "parfor_l_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } - - if (team_size_singleblock <= 0) { - // team_size_singleblock = team_policy(1, 1, - // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, - // node_count), Kokkos::ParallelForTag()); - team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctorUpper(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain), - Kokkos::ParallelForTag()); - } - - if (cutoff <= team_size_singleblock) { - SingleBlockFunctorUpper - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); - Kokkos::parallel_for( - "parfor_u_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally - SingleBlockFunctorUpper - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, 0, cutoff); - Kokkos::parallel_for( - "parfor_u_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally + SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, 0, + cutoff); + Kokkos::parallel_for( + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? + node_count += lvl_nodes; } + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? 
} } // end tri_solve_chain diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 9537e86d5a..2faf27d4a8 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -135,8 +135,8 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x, true); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; @@ -156,8 +156,8 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x, false); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; From 43591122420797a5f0fc3edfd251ab8a76c3df76 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 26 Jun 2024 14:47:35 -0600 Subject: [PATCH 11/41] Merge upper/lower tri_solve_streams --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 81 ++----------------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 8 +- 2 files changed, 11 insertions(+), 78 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 24c04a3798..7a131f2545 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2465,76 +2465,9 @@ tstf); } // end elseif // -------------------------------- // Stream interfaces // -------------------------------- - template - static void lower_tri_solve_streams( - const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - using LowerRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true); - using LowerTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true); - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. 
Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - range_policy(execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - LowerRPPoint( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - int team_size = thandle_v[i]->get_team_size(); - auto tp = team_size == -1 ? team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) : team_policy(execspace_v[i], lvl_nodes, team_size); - LowerTPPoint - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); - Kokkos::parallel_for("parfor_l_team", tp, tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl - } // end lower_tri_solve_streams - - template - static void upper_tri_solve_streams( + static void tri_solve_streams( const std::vector &execspace_v, const std::vector &thandle_v, const std::vector &row_map_v, @@ -2545,8 +2478,8 @@ tstf); } // end elseif using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false); - using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false); + using RPPointFunctor = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower); + using TPPointFunctor = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); @@ -2579,7 +2512,7 @@ tstf); } // end elseif "parfor_fixed_lvl", range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - UpperRPPoint( + RPPointFunctor( row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == @@ -2587,7 +2520,7 @@ tstf); } // end elseif SEQLVLSCHD_TP1) { int team_size = thandle_v[i]->get_team_size(); auto tp = team_size == -1 ? 
team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) : team_policy(execspace_v[i], lvl_nodes, team_size); - UpperTPPoint + TPPointFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); Kokkos::parallel_for("parfor_l_team", tp, tstf); @@ -2597,7 +2530,7 @@ tstf); } // end elseif } // end if (lvl < nlevels_v[i]) } // end for streams } // end for lvl - } // end upper_tri_solve_streams + } // end tri_solve_streams }; // struct SptrsvWrap diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 2faf27d4a8..477459b7ce 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -202,8 +202,8 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, row_map_v, + entries_v, values_v, b_v, x_v); } else { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -212,8 +212,8 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, row_map_v, + entries_v, values_v, b_v, x_v); } Kokkos::Profiling::popRegion(); } From 54f0617dcd8581b2f602f20f58211d85cf5f2e3a Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 27 Jun 2024 16:47:00 -0600 Subject: [PATCH 12/41] progres --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 376 ++++++++++++++++-- 1 file changed, 344 insertions(+), 32 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 7a131f2545..f4ab398e90 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -100,11 +100,15 @@ struct SptrsvWrap { void operator()(const int) const {} }; - // This functor unifies the lower and upper implementations, the hope is the - // "is_lowertri" check does not add noticable time on larger problems + /** + * Common base class for sptrsv functors that need to work for both + * point and block matrices. Default version does not support + * blocks + */ template - struct TriLvlSchedTP1SolverFunctor { + class LHSType, class RHSType, bool BlockEnabled> + bool BlockEnabled> + struct Common { RowMapType row_map; EntriesType entries; ValuesType values; @@ -112,6 +116,316 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; + using vec_t = scalar_t; + + static constexpr size_type BUFF_SIZE = 0; + + Common(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, + LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) + { + KK_REQUIRE_MSG(block_size_ == 0, + "Tried to use blocks with the unblocked Common?"); + } + + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return 0; } + + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type row, const scalar_t value) const { + lhs(row) = value; + } + + // divide. C = lhs / rhs + KOKKOS_INLINE_FUNCTION + void divide(const member_type &team, const scalar_t &lhs, const scalar_t &rhs, + scalar_t& C) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { C = lhs / rhs; }); + team.team_barrier(); + } + + // multiply_subtract. 
C -= A * B + KOKKOS_INLINE_FUNCTION + void multiply_subtract(const scalar_t &A, const scalar_t &B, + scalar_t &C) const { + C -= A * B; + } + + // lget + KOKKOS_INLINE_FUNCTION + scalar_t& lget(const size_type nnz) const { return L_values(nnz); } + + // rget + KOKKOS_INLINE_FUNCTION + scalar_t uget(const size_type nnz) const { return U_values(nnz); } + + // vget + KOKKOS_INLINE_FUNCTION + scalar_t vget(const size_type nnz) const { return A_values(nnz); } + + // print + KOKKOS_INLINE_FUNCTION + void print(const scalar_t &item) const { std::cout << item << std::endl; } + }; + + // Partial specialization for block support + template + struct Common { + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; + using value_type = typename LValuesType::value_type; + using cvalue_type = typename LValuesType::const_value_type; + + using Block = Kokkos::View< + value_type **, Layout, typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + // const block + using CBlock = Kokkos::View< + cvalue_type **, Layout, typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + // scratch block + using SBlock = Kokkos::View< + value_type **, Layout, typename execution_space::scratch_memory_space, + Kokkos::MemoryTraits >; + + using reftype = Block; + using valtype = Block; + + static constexpr size_type BUFF_SIZE = 128; + + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; + LevelViewType level_idx; + WorkViewType iw; + lno_t lev_start; + size_type block_size; + size_type block_items; + + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_) + : A_row_map(A_row_map_), + A_entries(A_entries_), + A_values(A_values_), + L_row_map(L_row_map_), + L_entries(L_entries_), + L_values(L_values_), + U_row_map(U_row_map_), + U_entries(U_entries_), + U_values(U_values_), + level_idx(level_idx_), + iw(iw_), + lev_start(lev_start_), + block_size(block_size_), + block_items(block_size * block_size) { + KK_REQUIRE_MSG(block_size > 0, + "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); + } + + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type block, const scalar_t &value) const { + KokkosBlas::SerialSet::invoke(value, lget(block)); + } + + KOKKOS_INLINE_FUNCTION + void lset(const size_type block, const CBlock &rhs) const { + auto lblock = lget(block); + assign(lblock, rhs); + } + + KOKKOS_INLINE_FUNCTION + void uset(const size_type block, const CBlock &rhs) const { + auto ublock = uget(block); + assign(ublock, rhs); + } + + // lset_id + KOKKOS_INLINE_FUNCTION + void lset_id(const member_type &team, const size_type block) const { + KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); + } + + // assign + template + KOKKOS_INLINE_FUNCTION void assign(const ViewT &lhs, + const CBlock &rhs) const { + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + lhs(i, j) = rhs(i, j); + } + } + } + 
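+    // Note on the block accessors (lget/uget/aget) below: block b aliases the
+    // packed scalars [b * block_items, (b + 1) * block_items) of the corresponding
+    // values array as a block_size x block_size LayoutRight view, e.g. with
+    // block_size = 2 (block_items = 4), block 3 covers scalars 12..15.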
+ // divide. lhs /= rhs (lhs = lhs * rhs^-1) + KOKKOS_INLINE_FUNCTION + void divide(const member_type &team, const Block &lhs, const CBlock &rhs, + scalar_t *buff) const { + // Need a temp block to do LU of rhs + Block LU(buff, block_size, block_size); + assign(LU, rhs); + KokkosBatched::TeamLU::invoke(team, LU); + + // rhs = LU + // rhs^-1 = U^-1 * L^-1 + // lhs = (lhs * U^-1) * L^-1, so do U trsm first + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + } + + // divide_left. lhs /= rhs (lhs = rhs^-1 * lhs) + KOKKOS_INLINE_FUNCTION + void divide_left(const Block &lhs, const CBlock &rhs, + scalar_t *buff) const { + Block LU(buff, block_size, block_size); + assign(LU, rhs); + KokkosBatched::SerialLU::invoke(LU); + + // rhs = LU + // rhs^-1 = U^-1 * L^-1 + // lhs = U^-1 * (L^-1 * lhs), so do L trsm first + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); + + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Left, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); + } + + // multiply_subtract. C -= A * B + KOKKOS_INLINE_FUNCTION + void multiply_subtract(const CBlock &A, const CBlock &B, + const Block &C) const { + // Use gemm. 
alpha is hardcoded to -1, beta hardcoded to 1 + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(-1.0, A, B, 1.0, + C); + } + + // lget + KOKKOS_INLINE_FUNCTION + Block lget(const size_type block) const { + return Block(L_values.data() + (block * block_items), block_size, + block_size); + } + + // lcopy + KOKKOS_INLINE_FUNCTION + Block lcopy(const size_type block, scalar_t *buff) const { + Block result(buff, block_size, block_size); + auto lblock = lget(block); + assign(result, lblock); + return result; + } + + // ucopy + KOKKOS_INLINE_FUNCTION + Block ucopy(const size_type block, scalar_t *buff) const { + Block result(buff, block_size, block_size); + auto ublock = uget(block); + assign(result, ublock); + return result; + } + + // uget + KOKKOS_INLINE_FUNCTION + Block uget(const size_type block) const { + return Block(U_values.data() + (block * block_items), block_size, + block_size); + } + + // aget + KOKKOS_INLINE_FUNCTION + CBlock aget(const size_type block) const { + return CBlock(A_values.data() + (block * block_items), block_size, + block_size); + } + + // uequal + KOKKOS_INLINE_FUNCTION + bool uequal(const size_type block, const scalar_t &value) const { + auto u_block = uget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + if (u_block(i, j) != value) { + return false; + } + } + } + return true; + } + + // print + KOKKOS_INLINE_FUNCTION + void print(const CBlock &item) const { + for (size_type i = 0; i < block_size; ++i) { + std::cout << " "; + for (size_type j = 0; j < block_size; ++j) { + std::cout << item(i, j) << " "; + } + std::cout << std::endl; + } + } + + // report + KOKKOS_INLINE_FUNCTION + void report() const { + std::cout << "JGF using blocked version with block_size=" << block_size + << std::endl; + } + }; + + template + struct TriLvlSchedTP1SolverFunctor : + : public Common + { + using Base = Common; + long node_count; // like "block" offset into ngbl, my_league is the "local" // offset @@ -120,33 +434,29 @@ struct SptrsvWrap { const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const long &node_count_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} + const long &node_count_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_), + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { auto my_league = team.league_rank(); // map to rowid auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); auto soffset = row_map(rowid); auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + auto rhs_rowid = Base::rget(rowid); + auto diff = Base::vec_t(0.0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { auto colid = entries(ptr); - auto val = values(ptr); if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); + auto val = Base::vget(ptr); + auto lhs_colid = Base::lget(colid); + Base::multiply_subtract(val, lhs_colid, tdiff); } }, diff); @@ -154,16 +464,14 @@ struct SptrsvWrap { team.team_barrier(); // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - 
if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = IsLower ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } + Base::add(diff, rhs_rowid); + auto val = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); + Base::divide(team, diff, val, Base::lget(rowid)); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { + /* auto my_league = team.league_rank(); // map to rowid auto rowid = nodes_grouped_by_level(my_league + node_count); auto my_rank = team.team_rank(); @@ -195,6 +503,7 @@ struct SptrsvWrap { lhs(rowid) = (rhs_rowid + diff) / values(diag); } } + */ }; template +#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ + Functor template @@ -1436,18 +1749,17 @@ struct SptrsvWrap { const auto nodes_per_level = thandle.get_nodes_per_level(); const auto hnodes_per_level = thandle.get_host_nodes_per_level(); const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - //const auto block_size = thandle.get_block_size(); + const auto block_size = thandle.get_block_size(); const auto block_enabled = thandle.is_block_enabled(); - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported"); - // Set up functor types - using LowerRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true); - //using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, true); - using LowerTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true); - //using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, true); + using LowerRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); + using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, true); + using LowerTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); + using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, true); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); using namespace KokkosSparse::Experimental; using device_t = Kokkos::Device; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; From aa8827e5388e00748a359ae9e2c042a82a49c7f5 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 1 Jul 2024 17:01:30 -0600 Subject: [PATCH 13/41] progress --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 470 +++++++++--------- 1 file changed, 233 insertions(+), 237 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index f4ab398e90..70222ff5d5 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -34,6 +34,11 @@ #include "KokkosBlas2_team_gemv_spec.hpp" #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif +#include "KokkosBlas1_team_axpby.hpp" +#include "KokkosBlas1_axpby.hpp" +#include "KokkosBlas1_set.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" //#define SERIAL_FOR_LOOP @@ -107,7 +112,6 @@ struct SptrsvWrap { */ template - bool BlockEnabled> struct Common { RowMapType row_map; EntriesType entries; @@ -116,9 +120,17 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - using vec_t = scalar_t; + using reftype = scalar_t &; - static constexpr size_type BUFF_SIZE = 0; + struct SBlock { + template + KOKKOS_INLINE_FUNCTION SBlock(T, size_type, size_type) {} + + KOKKOS_INLINE_FUNCTION + scalar_t *data() { return nullptr; } + }; + + 
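+    // Rough sketch of how the solver functors drive this interface in the scalar
+    // case (the blocked specialization below swaps in Vector/Block views and
+    // batched kernels behind the same calls):
+    //   reftype x_r = lget(rowid);                        // scalar_t& into lhs
+    //   multiply_subtract(vget(ptr), lget(colid), x_r);   // x_r -= A(ptr) * x(colid)
+    //   add(rget(rowid), x_r);                            // x_r += rhs(rowid)
+    //   divide(x_r, vget(diag), buff);                    // x_r /= diag (buff unused here)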
static constexpr size_type BUFF_SIZE = 1; Common(const RowMapType &row_map_, const EntriesType &entries_, @@ -147,14 +159,33 @@ struct SptrsvWrap { lhs(row) = value; } - // divide. C = lhs / rhs + // add. y += x + KOKKOS_INLINE_FUNCTION + void add(const member_type &team, const scalar_t& x, scalar_t& y) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); + team.team_barrier(); + } + + // serial add. y += x + KOKKOS_INLINE_FUNCTION + void add(const scalar_t& x, scalar_t& y) const { + y += x; + } + + // divide. b /= A KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const scalar_t &lhs, const scalar_t &rhs, - scalar_t& C) const { - Kokkos::single(Kokkos::PerTeam(team), [&]() { C = lhs / rhs; }); + void divide(const member_type &team, scalar_t &b, const scalar_t &A, + scalar_t*) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); team.team_barrier(); } + // serial divide. b /= A + KOKKOS_INLINE_FUNCTION + void divide(scalar_t &b, const scalar_t &A, scalar_t*) const { + b /= A; + } + // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION void multiply_subtract(const scalar_t &A, const scalar_t &B, @@ -164,15 +195,15 @@ struct SptrsvWrap { // lget KOKKOS_INLINE_FUNCTION - scalar_t& lget(const size_type nnz) const { return L_values(nnz); } + scalar_t& lget(const size_type row) const { return lhs(row); } // rget KOKKOS_INLINE_FUNCTION - scalar_t uget(const size_type nnz) const { return U_values(nnz); } + scalar_t rget(const size_type row) const { return rhs(row); } // vget KOKKOS_INLINE_FUNCTION - scalar_t vget(const size_type nnz) const { return A_values(nnz); } + scalar_t vget(const size_type nnz) const { return values(nnz); } // print KOKKOS_INLINE_FUNCTION @@ -181,70 +212,62 @@ struct SptrsvWrap { // Partial specialization for block support template - struct Common { + class LHSType, class RHSType> + struct Common { // BSR data is in LayoutRight! 
- using Layout = Kokkos::LayoutRight; - using value_type = typename LValuesType::value_type; - using cvalue_type = typename LValuesType::const_value_type; + using Layout = Kokkos::LayoutRight; using Block = Kokkos::View< - value_type **, Layout, typename LValuesType::device_type, + scalar_t **, Layout, typename ValuesType::device_type, Kokkos::MemoryTraits >; // const block using CBlock = Kokkos::View< - cvalue_type **, Layout, typename UValuesType::device_type, + const scalar_t **, Layout, typename ValuesType::device_type, Kokkos::MemoryTraits >; // scratch block using SBlock = Kokkos::View< - value_type **, Layout, typename execution_space::scratch_memory_space, + scalar_t **, Layout, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits >; - using reftype = Block; - using valtype = Block; + using Vector = Kokkos::View< + scalar_t *, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits >; + + using CVector = Kokkos::View< + const scalar_t *, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits >; static constexpr size_type BUFF_SIZE = 128; - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; - LevelViewType level_idx; - WorkViewType iw; - lno_t lev_start; + using reftype = Vector; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; size_type block_size; size_type block_items; - Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const lno_t &lev_start_, - const size_type &block_size_) - : A_row_map(A_row_map_), - A_entries(A_entries_), - A_values(A_values_), - L_row_map(L_row_map_), - L_entries(L_entries_), - L_values(L_values_), - U_row_map(U_row_map_), - U_entries(U_entries_), - U_values(U_values_), - level_idx(level_idx_), - iw(iw_), - lev_start(lev_start_), + Common(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, + LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), block_size(block_size_), - block_items(block_size * block_size) { + block_items(block_size * block_size) + { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); @@ -255,148 +278,119 @@ struct SptrsvWrap { // lset KOKKOS_INLINE_FUNCTION - void lset(const size_type block, const scalar_t &value) const { - KokkosBlas::SerialSet::invoke(value, lget(block)); + void lset(const size_type row, const scalar_t &value) const { + KokkosBlas::SerialSet::invoke(value, lget(row)); } KOKKOS_INLINE_FUNCTION - void lset(const size_type block, const CBlock &rhs) const { - auto lblock = lget(block); - assign(lblock, rhs); + void lset(const size_type row, const CVector &rhs) const { + auto lvec = lget(row); + assign(lvec, rhs); } - KOKKOS_INLINE_FUNCTION - void uset(const size_type block, const CBlock &rhs) const { - auto ublock = 
uget(block); - assign(ublock, rhs); + // assign + template + KOKKOS_INLINE_FUNCTION void assign(const View1 &lhs, + const View2 &rhs) const { + for (size_type i = 0; i < lhs.size(); ++i) { + lhs.data()[i] = rhs.data()[i]; + } + } + + template + KOKKOS_INLINE_FUNCTION void assign(const member_type &team, + const View1 &lhs, + const View2 &rhs) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs.size()), + [&](const size_type i) { + lhs.data()[i] = rhs.data()[i]; + }); } - // lset_id + // add. y += x KOKKOS_INLINE_FUNCTION - void lset_id(const member_type &team, const size_type block) const { - KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); + void add(const member_type &team, const scalar_t& x, scalar_t& y) const { + KokkosBlas::Experimental::axpy(team, 1.0, x, y); } - // assign - template - KOKKOS_INLINE_FUNCTION void assign(const ViewT &lhs, - const CBlock &rhs) const { - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - lhs(i, j) = rhs(i, j); - } - } + // serial add. y += x + KOKKOS_INLINE_FUNCTION + void add(const scalar_t& x, scalar_t& y) const { + KokkosBlas::serial_axpy(1.0, x, y); } - // divide. lhs /= rhs (lhs = lhs * rhs^-1) + // divide. b /= A (b = b * rhs^-1) KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const Block &lhs, const CBlock &rhs, - scalar_t *buff) const { + void divide(const member_type &team, const Vector &b, const CBlock &A, + scalar_t* buff) const { // Need a temp block to do LU of rhs Block LU(buff, block_size, block_size); - assign(LU, rhs); + assign(team, LU, rhs); KokkosBatched::TeamLU::invoke(team, LU); - // rhs = LU - // rhs^-1 = U^-1 * L^-1 - // lhs = (lhs * U^-1) * L^-1, so do U trsm first + // A = LU + // A^-1 = U^-1 * L^-1 + // b = (b * U^-1) * L^-1, so do U trsm first KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, b); KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, b); } - // divide_left. lhs /= rhs (lhs = rhs^-1 * lhs) + // serial divide. 
b /= A (b = b * rhs^-1) KOKKOS_INLINE_FUNCTION - void divide_left(const Block &lhs, const CBlock &rhs, - scalar_t *buff) const { + void divide(const Vector &b, const CBlock &A, scalar_t* buff) const { + // Need a temp block to do LU of rhs Block LU(buff, block_size, block_size); assign(LU, rhs); KokkosBatched::SerialLU::invoke(LU); - // rhs = LU - // rhs^-1 = U^-1 * L^-1 - // lhs = U^-1 * (L^-1 * lhs), so do L trsm first + // A = LU + // A^-1 = U^-1 * L^-1 + // b = (b * U^-1) * L^-1, so do U trsm first KokkosBatched::SerialTrsm< - KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); + KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, b); KokkosBatched::SerialTrsm< - KokkosBatched::Side::Left, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); + KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, b); } // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - void multiply_subtract(const CBlock &A, const CBlock &B, - const Block &C) const { + void multiply_subtract(const CBlock &A, const CVector &B, + const Vector &C) const { // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBatched::SerialGemm< - KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(-1.0, A, B, 1.0, - C); + KokkosBatched::SerialGemv< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Algo::Gemm::Blocked>:: + invoke(-1.0, A, B, 1.0, C); } // lget KOKKOS_INLINE_FUNCTION - Block lget(const size_type block) const { - return Block(L_values.data() + (block * block_items), block_size, - block_size); - } - - // lcopy - KOKKOS_INLINE_FUNCTION - Block lcopy(const size_type block, scalar_t *buff) const { - Block result(buff, block_size, block_size); - auto lblock = lget(block); - assign(result, lblock); - return result; - } - - // ucopy - KOKKOS_INLINE_FUNCTION - Block ucopy(const size_type block, scalar_t *buff) const { - Block result(buff, block_size, block_size); - auto ublock = uget(block); - assign(result, ublock); - return result; - } - - // uget - KOKKOS_INLINE_FUNCTION - Block uget(const size_type block) const { - return Block(U_values.data() + (block * block_items), block_size, - block_size); + Vector lget(const size_type row) const { + return Vector(lhs.data() + (row * block_size), block_size); } - // aget + // rget KOKKOS_INLINE_FUNCTION - CBlock aget(const size_type block) const { - return CBlock(A_values.data() + (block * block_items), block_size, - block_size); + CVector rget(const size_type row) const { + return Vector(rhs.data() + (row * block_size), block_size); } - // uequal + // vget KOKKOS_INLINE_FUNCTION - bool uequal(const size_type block, const scalar_t &value) const { - auto u_block = uget(block); - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - if (u_block(i, j) != value) { - return false; - } - } - } - return true; + CBlock vget(const size_type block) const { + return CBlock(values.data() + (block * block_items), block_size, block_size); } // print @@ -410,19 +404,12 @@ struct SptrsvWrap { std::cout << 
std::endl; } } - - // report - KOKKOS_INLINE_FUNCTION - void report() const { - std::cout << "JGF using blocked version with block_size=" << block_size - << std::endl; - } }; template struct TriLvlSchedTP1SolverFunctor : - : public Common + public Common { using Base = Common; @@ -442,68 +429,74 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); + auto rowid = Base::nodes_grouped_by_level(my_league + node_count); + auto soffset = Base::row_map(rowid); + auto eoffset = Base::row_map(rowid + 1); auto rhs_rowid = Base::rget(rowid); - auto diff = Base::vec_t(0.0); - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), + typename Base::reftype lhs_rowid = Base::lget(rowid); + + // Team-shared buffer. Use for team work. + const auto bs = Base::get_block_size(); + typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); + + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset + (IsLower ? 0 : 1), eoffset - (IsLower ? 1 : 0)), [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - if (colid != rowid) { - auto val = Base::vget(ptr); - auto lhs_colid = Base::lget(colid); - Base::multiply_subtract(val, lhs_colid, tdiff); - } + auto colid = Base::entries(ptr); + KK_KERNEL_ASSERT(colid != rowid); + auto val = Base::vget(ptr); + auto lhs_colid = Base::lget(colid); + // tdiff -= val * lhs_colid; + Base::multiply_subtract(val, lhs_colid, tdiff); }, - diff); + lhs_rowid); team.team_barrier(); // At end, finalize rowid == colid - Base::add(diff, rhs_rowid); - auto val = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); - Base::divide(team, diff, val, Base::lget(rowid)); + Base::add(team, lhs_rowid, rhs_rowid); // lhs_rowid += rhs(rowid) + auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); + // lhs_rowid /= val + Base::divide(team, lhs_rowid, diag, shared_buff.data()); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { - /* auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); + auto rowid = Base::nodes_grouped_by_level(my_league + node_count); auto my_rank = team.team_rank(); + auto soffset = Base::row_map(rowid); + auto eoffset = Base::row_map(rowid + 1); + auto rhs_rowid = Base::rget(rowid); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + typename Base::reftype lhs_rowid = Base::lget(rowid); + + // Team-shared buffer. Use for team work. 
+ const auto bs = Base::get_block_size(); + typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); auto diag = -1; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); + auto colid = Base::entries(ptr); if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); + auto val = Base::vget(ptr); + auto lhs_colid = Base::lget(colid); + // tdiff -= val * lhs_colid; + Base::multiply_subtract(val, lhs_colid, tdiff); } else { diag = ptr; } }, - diff); + lhs_rowid); team.team_barrier(); // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } + Base::add(team, lhs_rowid, rhs_rowid); // lhs_rowid += rhs(rowid) + Base::divide(team, lhs_rowid, Base::vget(diag), shared_buff.data()); } - */ }; template - struct TriLvlSchedRPSolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; + class LHSType, class RHSType, bool IsLower, bool BlockEnabled> + struct TriLvlSchedRPSolverFunctor : + public Common + { + using Base = Common; TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) + {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - /* - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = IsLower ? soffset : eoffset - 1; - (IsLower && (ptr < eoffset)) || (!IsLower && (ptr >= soffset)); ptr+=(IsLower ? 1 : -1)) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } + // Thread-local buffers. Use for Serial (non-team) work + scalar_t buff1[Base::BUFF_SIZE]; + + auto rowid = Base::nodes_grouped_by_level(i); + long soffset = Base::row_map(rowid); + long eoffset = Base::row_map(rowid + 1); + auto rhs_rowid = Base::rhs(rowid); + + typename Base::reftype lhs_rowid = Base::lget(rowid); + + for (long ptr = soffset + (IsLower ? 0 : 1); ptr < eoffset - (IsLower ? 1 : 0); ++ptr) { + auto colid = Base::entries(ptr); + KK_KERNEL_ASSERT(colid != rowid); + auto val = Base::vget(ptr); + auto lhs_colid = Base::lget(colid); + // lhs_rowid -= val * lhs_colid + Base::multiply_subtract(val, lhs_colid, lhs_rowid); } // end for ptr - */ + + Base::add(lhs_rowid, rhs_rowid); + auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); + Base::divide(lhs_rowid, diag, &buff1[0]); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { - /* - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); + // Thread-local buffers. 
Use for Serial (non-team) work + scalar_t buff1[Base::BUFF_SIZE]; + + auto rowid = Base::nodes_grouped_by_level(i); + long soffset = Base::row_map(rowid); + long eoffset = Base::row_map(rowid + 1); + auto rhs_rowid = Base::rget(rowid); + + typename Base::reftype lhs_rowid = Base::lget(rowid); + auto diag = -1; - for (long ptr = IsLower ? soffset : eoffset - 1; - (IsLower && (ptr < eoffset)) || (!IsLower && (ptr >= soffset)); ptr+=(IsLower ? 1 : -1)) { - auto colid = entries(ptr); - auto val = values(ptr); + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = Base::entries(ptr); if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); + auto val = Base::values(ptr); + auto lhs_colid = Base::lget(colid); + Base::multiply_subtract(val, lhs_colid, lhs_rowid); } else { diag = ptr; } } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - */ + Base::add(lhs_rowid, rhs_rowid); // lhs_rowid += rhs(rowid) + Base::divide(lhs_rowid, Base::vget(diag), &buff1[0]); } }; @@ -2147,10 +2143,10 @@ struct SptrsvWrap { KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported"); // Set up functor types - using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false); - //using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, true); - using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false); - //using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, true); + using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, false); + using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, true); + using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, false); + using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, true); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2704,7 +2700,7 @@ tstf); } // end elseif if (echain - schain == 1) { // if team_size is -1 (unset), get recommended size from Kokkos TriLvlSchedTP1SolverFunctor + LHSType, RHSType, IsLower, false> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); if (team_size == -1) { @@ -2790,8 +2786,8 @@ tstf); } // end elseif using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - using RPPointFunctor = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower); - using TPPointFunctor = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower); + using RPPointFunctor = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower, false); + using TPPointFunctor = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower, false); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); From a7e52f9b6dd8dbda9135fffb65f7a57e7f316c15 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 2 Jul 2024 18:06:40 -0600 Subject: [PATCH 14/41] Progress, test added --- sparse/unit_test/Test_Sparse_sptrsv.hpp | 86 +++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index b8b35bc422..6ffbc4033c 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -74,6 +74,16 @@ struct SptrsvTest { return A; } + static std::vector> get_6x6_ut_ones_fixture() { + std::vector> A = {{1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 1.00, 0.00, 0.00, 0.00, 1.00}, + {0.00, 0.00, 1.00, 1.00, 0.00, 1.00}, + {0.00, 
0.00, 0.00, 1.00, 0.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 0.00, 1.00}}; + return A; + } + static std::vector> get_5x5_ut_fixture() { const auto KZ = KEEP_ZERO(); std::vector> A = {{5.00, 1.00, 1.00, 0.00, KZ}, @@ -103,6 +113,17 @@ struct SptrsvTest { return A; } + static std::vector> get_6x6_lt_ones_fixture() { + std::vector> A = {{1.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 1.00, 1.00, 0.00}, + {0.00, 1.00, 1.00, 1.00, 1.00, 1.00}}; + return A; + } + + struct ReductionCheck { ValuesType lhs; @@ -629,6 +650,70 @@ struct SptrsvTest { } } + static void run_test_sptrsv_blocks_impl(const bool is_lower, const size_type block_size) { + constexpr scalar_t ZERO = scalar_t(0); + constexpr scalar_t ONE = scalar_t(1); + + RowMapType row_map; + EntriesType entries; + ValuesType values; + + auto fixture = is_lower ? get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); + + compress_matrix(row_map, entries, values, fixture); + + const size_type nrows = row_map.size() - 1; + const size_type nnz = values.size(); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + + // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, currently unavailable + for (auto alg : {SPTRSVAlgorithm::SEQLVLSCHD_TP1, SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN}) { + KernelHandle kh; + kh.create_sptrsv_handle(alg, nrows, is_lower); + + if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + } + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + + kh.destroy_sptrsv_handle(); + } + } + + static void run_test_sptrsv_blocks() { + for (size_type block_size : {1}) { + run_test_sptrsv_blocks_impl(true, block_size); + run_test_sptrsv_blocks_impl(false, block_size); + } + } + static void run_test_sptrsv_streams(int test_algo, int nstreams) { // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition @@ -817,6 +902,7 @@ template ; TestStruct::run_test_sptrsv(); + TestStruct::run_test_sptrsv_blocks(); } template Date: Tue, 2 Jul 2024 20:39:27 -0600 Subject: [PATCH 15/41] progress and fixes --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 20 ++++++------ sparse/unit_test/Test_Sparse_sptrsv.hpp | 31 ++++++++++++------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 70222ff5d5..441639e6b5 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -28,12 +28,12 @@ #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV // Enable 
supernodal sptrsv -#include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBatched_Util.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" -#include "KokkosBatched_Trsm_Team_Impl.hpp" #endif +#include "KokkosBlas3_trsm.hpp" +#include "KokkosBatched_Trsm_Team_Impl.hpp" #include "KokkosBlas1_team_axpby.hpp" #include "KokkosBlas1_axpby.hpp" #include "KokkosBlas1_set.hpp" @@ -454,7 +454,7 @@ struct SptrsvWrap { team.team_barrier(); // At end, finalize rowid == colid - Base::add(team, lhs_rowid, rhs_rowid); // lhs_rowid += rhs(rowid) + Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); // lhs_rowid /= val Base::divide(team, lhs_rowid, diag, shared_buff.data()); @@ -494,7 +494,7 @@ struct SptrsvWrap { team.team_barrier(); // At end, finalize rowid == colid - Base::add(team, lhs_rowid, rhs_rowid); // lhs_rowid += rhs(rowid) + Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) Base::divide(team, lhs_rowid, Base::vget(diag), shared_buff.data()); } }; @@ -713,7 +713,7 @@ struct SptrsvWrap { Base::multiply_subtract(val, lhs_colid, lhs_rowid); } // end for ptr - Base::add(lhs_rowid, rhs_rowid); + Base::add(rhs_rowid, lhs_rowid); auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); Base::divide(lhs_rowid, diag, &buff1[0]); } @@ -741,7 +741,7 @@ struct SptrsvWrap { diag = ptr; } } // end for ptr - Base::add(lhs_rowid, rhs_rowid); // lhs_rowid += rhs(rowid) + Base::add(rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) Base::divide(lhs_rowid, Base::vget(diag), &buff1[0]); } }; @@ -1755,7 +1755,6 @@ struct SptrsvWrap { using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, true); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); using namespace KokkosSparse::Experimental; using device_t = Kokkos::Device; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; @@ -1885,6 +1884,7 @@ struct SptrsvWrap { else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -2033,6 +2033,7 @@ struct SptrsvWrap { SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -2140,8 +2141,6 @@ struct SptrsvWrap { //const auto block_size = thandle.get_block_size(); const auto block_enabled = thandle.is_block_enabled(); - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported"); - // Set up functor types using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, false); using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, true); @@ -2276,6 +2275,7 @@ tstf); } // end elseif else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -2529,6 +2529,8 @@ tstf); } // end elseif 
SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); + #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 6ffbc4033c..303e4c13f5 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -654,36 +654,43 @@ struct SptrsvTest { constexpr scalar_t ZERO = scalar_t(0); constexpr scalar_t ONE = scalar_t(1); - RowMapType row_map; - EntriesType entries; - ValuesType values; + RowMapType point_row_map; + EntriesType point_entries; + ValuesType point_values; auto fixture = is_lower ? get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); - compress_matrix(row_map, entries, values, fixture); + compress_matrix(point_row_map, point_entries, point_values, fixture); - const size_type nrows = row_map.size() - 1; - const size_type nnz = values.size(); + const size_type point_nrows = point_row_map.size() - 1; + const size_type point_nnz = point_values.size(); // Create known_lhs, generate rhs, then solve for lhs to compare to // known_lhs - ValuesType known_lhs("known_lhs", nrows); + ValuesType known_lhs("known_lhs", point_nrows); // Create known solution lhs set to all 1's Kokkos::deep_copy(known_lhs, ONE); // Solution to find - ValuesType lhs("lhs", nrows); + ValuesType lhs("lhs", point_nrows); // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); + ValuesType rhs("rhs", point_nrows); - Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + Crs triMtx_crs("triMtx", point_nrows, point_nrows, point_nnz, point_values, point_row_map, point_entries); + Bsr triMtx(triMtx_crs, block_size); KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; + + const size_type nrows = row_map.size() - 1; + // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, currently unavailable for (auto alg : {SPTRSVAlgorithm::SEQLVLSCHD_TP1, SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN}) { KernelHandle kh; - kh.create_sptrsv_handle(alg, nrows, is_lower); + kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { auto chain_threshold = 1; @@ -708,7 +715,7 @@ struct SptrsvTest { } static void run_test_sptrsv_blocks() { - for (size_type block_size : {1}) { + for (size_type block_size : {1, 2, 3}) { run_test_sptrsv_blocks_impl(true, block_size); run_test_sptrsv_blocks_impl(false, block_size); } From ef1025faed6b9778c486bb16227a580cb0d1eb23 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 2 Jul 2024 21:14:19 -0600 Subject: [PATCH 16/41] prog --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 29 +++++++++---------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 25 +++++++++++++--- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 441639e6b5..6022161919 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -1728,7 +1728,7 @@ struct SptrsvWrap { #define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ Functor - template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, @@ 
-1747,12 +1747,11 @@ struct SptrsvWrap { const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); const auto block_enabled = thandle.is_block_enabled(); + assert(block_enabled == BlockEnabled); // Set up functor types - using LowerRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); - using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, true); - using LowerTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); - using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, true); + using LowerRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, BlockEnabled); + using LowerTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1818,7 +1817,7 @@ struct SptrsvWrap { #endif if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - LowerRPPoint lrpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); + LowerRPFunc lrpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); Kokkos::parallel_for( "parfor_fixed_lvl", @@ -1829,7 +1828,7 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - LowerTPPoint ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); + LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); Kokkos::parallel_for( @@ -2116,10 +2115,9 @@ struct SptrsvWrap { std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; #endif - } // end lower_tri_solve - template static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, @@ -2138,14 +2136,13 @@ struct SptrsvWrap { auto nodes_per_level = thandle.get_nodes_per_level(); auto hnodes_per_level = thandle.get_host_nodes_per_level(); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - //const auto block_size = thandle.get_block_size(); + const auto block_size = thandle.get_block_size(); const auto block_enabled = thandle.is_block_enabled(); + assert(block_enabled == BlockEnabled); // Set up functor types - using UpperRPPoint = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, false); - using LowerRPBlock = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, true); - using UpperTPPoint = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, false); - using LowerTPBlock = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, true); + using UpperRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, BlockEnabled); + using UpperTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2211,7 +2208,7 @@ struct SptrsvWrap { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - UpperRPPoint urpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level); + UpperRPFunc urpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( @@ -2221,7 +2218,7 @@ struct SptrsvWrap { } else if 
(thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - UpperTPPoint utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); + UpperTPFunc utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); Kokkos::parallel_for( diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 477459b7ce..d4418b7bf8 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -125,6 +125,7 @@ struct SPTRSV_SOLVEget_sptrsv_handle(); + const auto block_enabled = sptrsv_handle->is_block_enabled(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() ? "KokkosSparse_sptrsv[lower]" : "KokkosSparse_sptrsv[upper]"); @@ -146,8 +147,16 @@ struct SPTRSV_SOLVE(space, *sptrsv_handle, row_map, entries, + values, b, x); + } + else { + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); + } + } } } else { if (sptrsv_handle->is_symbolic_complete() == false) { @@ -167,8 +176,16 @@ struct SPTRSV_SOLVE(space, *sptrsv_handle, row_map, entries, + values, b, x); + } + else { + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); + } + } } } Kokkos::Profiling::popRegion(); From b6c95b3bcc0fd6b3b27fe93a636fc5945589b89a Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 4 Jul 2024 13:00:08 -0600 Subject: [PATCH 17/41] prog but broken --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 209 ++++++++++++++---- 1 file changed, 162 insertions(+), 47 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 6022161919..f397308c96 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -121,6 +121,8 @@ struct SptrsvWrap { entries_t nodes_grouped_by_level; using reftype = scalar_t &; + using ArrayType = reftype; + using SumArray = reftype; struct SBlock { template @@ -161,35 +163,35 @@ struct SptrsvWrap { // add. y += x KOKKOS_INLINE_FUNCTION - void add(const member_type &team, const scalar_t& x, scalar_t& y) const { + static void add(const member_type &team, const scalar_t& x, scalar_t& y) { Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); team.team_barrier(); } // serial add. y += x KOKKOS_INLINE_FUNCTION - void add(const scalar_t& x, scalar_t& y) const { + static void add(const scalar_t& x, scalar_t& y) { y += x; } // divide. b /= A KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, scalar_t &b, const scalar_t &A, - scalar_t*) const { + static void divide(const member_type &team, scalar_t &b, const scalar_t &A, + scalar_t*) { Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); team.team_barrier(); } // serial divide. b /= A KOKKOS_INLINE_FUNCTION - void divide(scalar_t &b, const scalar_t &A, scalar_t*) const { + static void divide(scalar_t &b, const scalar_t &A, scalar_t*) { b /= A; } // multiply_subtract. 
C -= A * B KOKKOS_INLINE_FUNCTION - void multiply_subtract(const scalar_t &A, const scalar_t &B, - scalar_t &C) const { + static void multiply_subtract(const scalar_t &A, const scalar_t &B, + scalar_t &C) { C -= A * B; } @@ -207,7 +209,11 @@ struct SptrsvWrap { // print KOKKOS_INLINE_FUNCTION - void print(const scalar_t &item) const { std::cout << item << std::endl; } + static void print(const scalar_t &item) { std::cout << item << std::endl; } + + KOKKOS_INLINE_FUNCTION + static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } + }; // Partial specialization for block support @@ -243,6 +249,70 @@ struct SptrsvWrap { using reftype = Vector; + struct ArrayType + { + scalar_t m_data[BUFF_SIZE]; + + KOKKOS_INLINE_FUNCTION + ArrayType() { init(); } + + KOKKOS_INLINE_FUNCTION + ArrayType(const ArrayType& rhs) { + for (int i = 0; i < BUFF_SIZE; ++i) m_data[i] = rhs.m_data[i]; + } + + KOKKOS_INLINE_FUNCTION + ArrayType(const Vector&) { init(); } + + KOKKOS_INLINE_FUNCTION + void init() { + for (int i = 0; i < BUFF_SIZE; ++i) m_data[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + ArrayType& operator +=(const ArrayType& rhs) { + for (int i = 0; i < BUFF_SIZE; ++i) m_data[i] += rhs.m_data[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION + ArrayType& operator +=(const values_t& rhs) { + for (int i = 0; i < rhs.size(); ++i) m_data[i] += rhs(i); + return *this; + } + }; + + struct SumArray + { + using reducer = SumArray; + using value_type = ArrayType; + using result_view_type = Kokkos::View; + + private: + value_type& m_value; + + public: + KOKKOS_INLINE_FUNCTION + SumArray(value_type& value) : m_value(value) {} + + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest += src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { val.init(); } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return m_value; } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return result_view_type(&m_value, 1); } + + KOKKOS_INLINE_FUNCTION + bool reference_scalar() const { return true; } + }; + RowMapType row_map; EntriesType entries; ValuesType values; @@ -290,17 +360,17 @@ struct SptrsvWrap { // assign template - KOKKOS_INLINE_FUNCTION void assign(const View1 &lhs, - const View2 &rhs) const { + KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs, + const View2 &rhs) { for (size_type i = 0; i < lhs.size(); ++i) { lhs.data()[i] = rhs.data()[i]; } } template - KOKKOS_INLINE_FUNCTION void assign(const member_type &team, + KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, const View1 &lhs, - const View2 &rhs) const { + const View2 &rhs) { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs.size()), [&](const size_type i) { lhs.data()[i] = rhs.data()[i]; @@ -309,23 +379,24 @@ struct SptrsvWrap { // add. y += x KOKKOS_INLINE_FUNCTION - void add(const member_type &team, const scalar_t& x, scalar_t& y) const { + static void add(const member_type &team, const CVector& x, const Vector& y) { KokkosBlas::Experimental::axpy(team, 1.0, x, y); } // serial add. y += x KOKKOS_INLINE_FUNCTION - void add(const scalar_t& x, scalar_t& y) const { + static void add(const CVector& x, const Vector& y) { KokkosBlas::serial_axpy(1.0, x, y); } // divide. 
b /= A (b = b * rhs^-1) KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const Vector &b, const CBlock &A, - scalar_t* buff) const { - // Need a temp block to do LU of rhs + static void divide(const member_type &team, const Vector &b, const CBlock &A, + scalar_t* buff) { + // Need a temp block to do LU of A + const auto block_size = b.size(); Block LU(buff, block_size, block_size); - assign(team, LU, rhs); + assign(team, LU, A); KokkosBatched::TeamLU::invoke(team, LU); @@ -345,10 +416,11 @@ struct SptrsvWrap { // serial divide. b /= A (b = b * rhs^-1) KOKKOS_INLINE_FUNCTION - void divide(const Vector &b, const CBlock &A, scalar_t* buff) const { - // Need a temp block to do LU of rhs + static void divide(const Vector &b, const CBlock &A, scalar_t* buff) { + // Need a temp block to do LU of A + const auto block_size = b.size(); Block LU(buff, block_size, block_size); - assign(LU, rhs); + assign(LU, A); KokkosBatched::SerialLU::invoke(LU); // A = LU @@ -367,9 +439,10 @@ struct SptrsvWrap { // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - void multiply_subtract(const CBlock &A, const CVector &B, - const Vector &C) const { + static void multiply_subtract(const CBlock &A, const CVector &B, + ArrayType &Ca) { // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 + Vector C(&Ca.m_data[0], B.size()); KokkosBatched::SerialGemv< KokkosBatched::Trans::NoTranspose, KokkosBatched::Algo::Gemm::Blocked>:: invoke(-1.0, A, B, 1.0, C); @@ -384,7 +457,7 @@ struct SptrsvWrap { // rget KOKKOS_INLINE_FUNCTION CVector rget(const size_type row) const { - return Vector(rhs.data() + (row * block_size), block_size); + return CVector(rhs.data() + (row * block_size), block_size); } // vget @@ -395,14 +468,33 @@ struct SptrsvWrap { // print KOKKOS_INLINE_FUNCTION - void print(const CBlock &item) const { - for (size_type i = 0; i < block_size; ++i) { + static void print(const CBlock &item) { + for (size_type i = 0; i < item.extent(0); ++i) { std::cout << " "; - for (size_type j = 0; j < block_size; ++j) { + for (size_type j = 0; j < item.extent(1); ++j) { std::cout << item(i, j) << " "; } std::cout << std::endl; } + } + + // print + KOKKOS_INLINE_FUNCTION + static void print(const CVector &item) { + for (size_type i = 0; i < item.extent(0); ++i) { + std::cout << item(i) << " "; + } + std::cout << std::endl; + } + + KOKKOS_INLINE_FUNCTION + static void print(const ArrayType& rhs, const int block_size) + { + std::cout << "Array: "; + for (int i = 0; i < block_size; ++i) { + std::cout << rhs.m_data[i] << " "; + } + std::cout << std::endl; } }; @@ -426,33 +518,48 @@ struct SptrsvWrap { : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_), node_count(node_count_) {} + struct ReduceFunctor + { + const Base* m_obj; + + using accum_t = std::conditional_t; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, accum_t& accum) const + { + auto colid = m_obj->entries(i); + auto val = m_obj->vget(i); + auto lhs_colid = m_obj->lget(colid); + // accum -= val * lhs_colid; + Base::multiply_subtract(val, lhs_colid, accum); + } + }; + KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = Base::nodes_grouped_by_level(my_league + node_count); - auto soffset = Base::row_map(rowid); - auto eoffset = Base::row_map(rowid + 1); - auto rhs_rowid = Base::rget(rowid); + using reduce_item = typename Base::ArrayType; + using reducer = typename Base::SumArray; - typename Base::reftype lhs_rowid = 
Base::lget(rowid); + const auto my_league = team.league_rank(); // map to rowid + const auto rowid = Base::nodes_grouped_by_level(my_league + node_count); + const auto soffset = Base::row_map(rowid); + const auto eoffset = Base::row_map(rowid + 1); + const auto rhs_rowid = Base::rget(rowid); - // Team-shared buffer. Use for team work. - const auto bs = Base::get_block_size(); - typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); + reduce_item reduce = Base::lget(rowid); + ReduceFunctor rf {this}; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset + (IsLower ? 0 : 1), eoffset - (IsLower ? 1 : 0)), - [&](const long ptr, scalar_t &tdiff) { - auto colid = Base::entries(ptr); - KK_KERNEL_ASSERT(colid != rowid); - auto val = Base::vget(ptr); - auto lhs_colid = Base::lget(colid); - // tdiff -= val * lhs_colid; - Base::multiply_subtract(val, lhs_colid, tdiff); - }, - lhs_rowid); - + rf, reducer(reduce)); team.team_barrier(); + // Team-shared buffer. Use for team work. + typename Base::reftype lhs_rowid = Base::lget(rowid); + const auto bs = Base::get_block_size(); + Base::print(reduce, bs); + Base::print(lhs_rowid); + typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); + // At end, finalize rowid == colid Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); @@ -462,6 +569,7 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { + /* auto my_league = team.league_rank(); // map to rowid auto rowid = Base::nodes_grouped_by_level(my_league + node_count); auto my_rank = team.team_rank(); @@ -479,7 +587,7 @@ struct SptrsvWrap { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { + [&](const long ptr, typename Base::reftype tdiff) { auto colid = Base::entries(ptr); if (colid != rowid) { auto val = Base::vget(ptr); @@ -496,6 +604,7 @@ struct SptrsvWrap { // At end, finalize rowid == colid Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) Base::divide(team, lhs_rowid, Base::vget(diag), shared_buff.data()); + */ } }; @@ -694,9 +803,12 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { + /* // Thread-local buffers. Use for Serial (non-team) work scalar_t buff1[Base::BUFF_SIZE]; + std::cout << "JGF TriLvlSchedRPSolverFunctor i=" << i << ", block_size=" << Base::get_block_size() << " " << BlockEnabled << std::endl; + auto rowid = Base::nodes_grouped_by_level(i); long soffset = Base::row_map(rowid); long eoffset = Base::row_map(rowid + 1); @@ -716,10 +828,12 @@ struct SptrsvWrap { Base::add(rhs_rowid, lhs_rowid); auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); Base::divide(lhs_rowid, diag, &buff1[0]); + */ } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { + /* // Thread-local buffers. 
Use for Serial (non-team) work scalar_t buff1[Base::BUFF_SIZE]; @@ -743,6 +857,7 @@ struct SptrsvWrap { } // end for ptr Base::add(rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) Base::divide(lhs_rowid, Base::vget(diag), &buff1[0]); + */ } }; From c5b65444db4b36981b35e1af53b7720b9bb3dffd Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 Jul 2024 16:36:48 -0600 Subject: [PATCH 18/41] Prog with debug prints --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 75 ++++++++++++++++--- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index f397308c96..00ab494538 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -38,7 +38,7 @@ #include "KokkosBlas1_axpby.hpp" #include "KokkosBlas1_set.hpp" #include "KokkosBatched_LU_Decl.hpp" -#include "KokkosBatched_Gemv_Decl.hpp" +#include "KokkosBlas2_serial_gemv_impl.hpp" //#define SERIAL_FOR_LOOP @@ -130,6 +130,8 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION scalar_t *data() { return nullptr; } + + static int shmem_size(size_type, size_type) { return 0; } }; static constexpr size_type BUFF_SIZE = 1; @@ -164,7 +166,9 @@ struct SptrsvWrap { // add. y += x KOKKOS_INLINE_FUNCTION static void add(const member_type &team, const scalar_t& x, scalar_t& y) { + scalar_t orig = y; Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); + std::cout << "y(" << y << ") = y(" << orig << ") + x(" << x << ")" << std::endl; team.team_barrier(); } @@ -178,7 +182,9 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void divide(const member_type &team, scalar_t &b, const scalar_t &A, scalar_t*) { + scalar_t orig = b; Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); + std::cout << "b(" << b << ") = b(" << orig << ") / A(" << A << ")" << std::endl; team.team_barrier(); } @@ -192,9 +198,14 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) { + scalar_t orig = C; C -= A * B; + std::cout << "C(" << C << ") = C(" << orig << ") - A(" << A << ") * B(" << B << ")" << std::endl; } + KOKKOS_INLINE_FUNCTION + static void copy(const member_type&, scalar_t&, const scalar_t&) {} + // lget KOKKOS_INLINE_FUNCTION scalar_t& lget(const size_type row) const { return lhs(row); } @@ -380,6 +391,13 @@ struct SptrsvWrap { // add. 
y += x KOKKOS_INLINE_FUNCTION static void add(const member_type &team, const CVector& x, const Vector& y) { + scalar_t orig = y(0); + Kokkos::single(Kokkos::PerTeam(team), [&]() { y(0) += x(0); }); + team.team_barrier(); + std::cout << "y(" << y(0) << ") = y(" << orig << ") + x(" << x(0) << ")" << std::endl; + + return; + KokkosBlas::Experimental::axpy(team, 1.0, x, y); } @@ -393,6 +411,12 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void divide(const member_type &team, const Vector &b, const CBlock &A, scalar_t* buff) { + scalar_t orig = b(0); + Kokkos::single(Kokkos::PerTeam(team), [&]() { b(0) /= A(0,0); }); + team.team_barrier(); + std::cout << "b(" << b(0) << ") = b(" << orig << ") / A(" << A(0,0) << ")" << std::endl; + return; + // Need a temp block to do LU of A const auto block_size = b.size(); Block LU(buff, block_size, block_size); @@ -441,11 +465,22 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void multiply_subtract(const CBlock &A, const CVector &B, ArrayType &Ca) { + scalar_t orig = Ca.m_data[0]; + Ca.m_data[0] -= A(0,0) * B(0); + std::cout << "C(" << Ca.m_data[0] << ") = C(" << orig << ") - A(" << A(0,0) << ") * B(" << B(0) << ")" << std::endl; + return; + // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 Vector C(&Ca.m_data[0], B.size()); - KokkosBatched::SerialGemv< - KokkosBatched::Trans::NoTranspose, KokkosBatched::Algo::Gemm::Blocked>:: - invoke(-1.0, A, B, 1.0, C); + KokkosBlas::SerialGemv< + KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>:: + invoke(-1.0, A, B, 1.0, C); + } + + KOKKOS_INLINE_FUNCTION + static void copy(const member_type &team, const Vector& lhs, ArrayType& rhsa) { + CVector rhs(&rhsa.m_data[0], lhs.size()); + assign(team, lhs, rhs); } // lget @@ -469,6 +504,7 @@ struct SptrsvWrap { // print KOKKOS_INLINE_FUNCTION static void print(const CBlock &item) { + std::cout << "Block: "; for (size_type i = 0; i < item.extent(0); ++i) { std::cout << " "; for (size_type j = 0; j < item.extent(1); ++j) { @@ -481,6 +517,7 @@ struct SptrsvWrap { // print KOKKOS_INLINE_FUNCTION static void print(const CVector &item) { + std::cout << "Vector: "; for (size_type i = 0; i < item.extent(0); ++i) { std::cout << item(i) << " "; } @@ -496,6 +533,17 @@ struct SptrsvWrap { } std::cout << std::endl; } + + KOKKOS_INLINE_FUNCTION + static void print(const SumArray& rhs, const int block_size) + { + std::cout << "SumArray: "; + for (int i = 0; i < block_size; ++i) { + std::cout << rhs.reference().m_data[i] << " "; + } + std::cout << std::endl; + } + }; template lget(colid); // accum -= val * lhs_colid; Base::multiply_subtract(val, lhs_colid, accum); + std::cout << " For i=" << i << ", accum: "; Base::print(accum, 1); } }; @@ -546,25 +595,29 @@ struct SptrsvWrap { const auto eoffset = Base::row_map(rowid + 1); const auto rhs_rowid = Base::rget(rowid); - reduce_item reduce = Base::lget(rowid); + std::cout << "operator() for rowid: " << rowid << std::endl; + + typename Base::reftype lhs_rowid = Base::lget(rowid); + reduce_item reduce = lhs_rowid; ReduceFunctor rf {this}; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset + (IsLower ? 0 : 1), eoffset - (IsLower ? 1 : 0)), rf, reducer(reduce)); team.team_barrier(); + Base::copy(team, lhs_rowid, reduce); + // Team-shared buffer. Use for team work. 
- typename Base::reftype lhs_rowid = Base::lget(rowid); const auto bs = Base::get_block_size(); - Base::print(reduce, bs); - Base::print(lhs_rowid); + std::cout << "Reduction: "; Base::print(reduce, bs); typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); // At end, finalize rowid == colid Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); - // lhs_rowid /= val + // lhs_rowid /= diag Base::divide(team, lhs_rowid, diag, shared_buff.data()); + std::cout << "lhs: "; Base::print(lhs_rowid); } KOKKOS_INLINE_FUNCTION @@ -1946,6 +1999,8 @@ struct SptrsvWrap { LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = LowerTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( @@ -2336,6 +2391,8 @@ struct SptrsvWrap { UpperTPFunc utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = UpperTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_u_team", Kokkos::Experimental::require( From c6714b43b3c9dea7112d98112f5d9c7e5338cd8e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 Jul 2024 16:39:33 -0600 Subject: [PATCH 19/41] Remove extreme debug printing --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 00ab494538..fe5686dcdb 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -168,7 +168,6 @@ struct SptrsvWrap { static void add(const member_type &team, const scalar_t& x, scalar_t& y) { scalar_t orig = y; Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); - std::cout << "y(" << y << ") = y(" << orig << ") + x(" << x << ")" << std::endl; team.team_barrier(); } @@ -184,7 +183,6 @@ struct SptrsvWrap { scalar_t*) { scalar_t orig = b; Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); - std::cout << "b(" << b << ") = b(" << orig << ") / A(" << A << ")" << std::endl; team.team_barrier(); } @@ -200,7 +198,6 @@ struct SptrsvWrap { scalar_t &C) { scalar_t orig = C; C -= A * B; - std::cout << "C(" << C << ") = C(" << orig << ") - A(" << A << ") * B(" << B << ")" << std::endl; } KOKKOS_INLINE_FUNCTION @@ -391,13 +388,6 @@ struct SptrsvWrap { // add. 
y += x KOKKOS_INLINE_FUNCTION static void add(const member_type &team, const CVector& x, const Vector& y) { - scalar_t orig = y(0); - Kokkos::single(Kokkos::PerTeam(team), [&]() { y(0) += x(0); }); - team.team_barrier(); - std::cout << "y(" << y(0) << ") = y(" << orig << ") + x(" << x(0) << ")" << std::endl; - - return; - KokkosBlas::Experimental::axpy(team, 1.0, x, y); } @@ -411,12 +401,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void divide(const member_type &team, const Vector &b, const CBlock &A, scalar_t* buff) { - scalar_t orig = b(0); - Kokkos::single(Kokkos::PerTeam(team), [&]() { b(0) /= A(0,0); }); - team.team_barrier(); - std::cout << "b(" << b(0) << ") = b(" << orig << ") / A(" << A(0,0) << ")" << std::endl; - return; - // Need a temp block to do LU of A const auto block_size = b.size(); Block LU(buff, block_size, block_size); @@ -465,11 +449,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void multiply_subtract(const CBlock &A, const CVector &B, ArrayType &Ca) { - scalar_t orig = Ca.m_data[0]; - Ca.m_data[0] -= A(0,0) * B(0); - std::cout << "C(" << Ca.m_data[0] << ") = C(" << orig << ") - A(" << A(0,0) << ") * B(" << B(0) << ")" << std::endl; - return; - // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 Vector C(&Ca.m_data[0], B.size()); KokkosBlas::SerialGemv< From e45fc061f1425f59db3f95a103771097222c885e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 Jul 2024 17:10:17 -0600 Subject: [PATCH 20/41] works --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index fe5686dcdb..e5b7843aaa 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -33,6 +33,7 @@ #include "KokkosBlas2_team_gemv_spec.hpp" #endif #include "KokkosBlas3_trsm.hpp" +#include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_Trsm_Team_Impl.hpp" #include "KokkosBlas1_team_axpby.hpp" #include "KokkosBlas1_axpby.hpp" @@ -410,16 +411,16 @@ struct SptrsvWrap { // A = LU // A^-1 = U^-1 * L^-1 - // b = (b * U^-1) * L^-1, so do U trsm first - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + // b = (b * U^-1) * L^-1, so do U trsv first + KokkosBatched::TeamTrsv< + member_type, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, b); + KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, + KokkosBatched::TeamTrsv< + member_type, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, b); + KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); } // serial divide. 
b /= A (b = b * rhs^-1) @@ -433,16 +434,16 @@ struct SptrsvWrap { // A = LU // A^-1 = U^-1 * L^-1 - // b = (b * U^-1) * L^-1, so do U trsm first - KokkosBatched::SerialTrsm< - KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + // b = (b * U^-1) * L^-1, so do U trsv first + KokkosBatched::SerialTrsv< + KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, b); + KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); - KokkosBatched::SerialTrsm< - KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, + KokkosBatched::SerialTrsv< + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, b); + KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); } // multiply_subtract. C -= A * B @@ -593,8 +594,10 @@ struct SptrsvWrap { // At end, finalize rowid == colid Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) + std::cout << "lhs: "; Base::print(lhs_rowid); auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); // lhs_rowid /= diag + std::cout << "diag: "; Base::print(diag); Base::divide(team, lhs_rowid, diag, shared_buff.data()); std::cout << "lhs: "; Base::print(lhs_rowid); } From 1d5ab884f16e94b380845c5a403209556f05bf3c Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 Jul 2024 18:21:48 -0600 Subject: [PATCH 21/41] all working --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 103 +++++++++--------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index e5b7843aaa..6f2303e360 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -450,8 +450,14 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void multiply_subtract(const CBlock &A, const CVector &B, ArrayType &Ca) { - // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 Vector C(&Ca.m_data[0], B.size()); + multiply_subtract(A, B, C); + } + + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const CBlock &A, const CVector &B, + Vector &C) { + // Use gemv. 
alpha is hardcoded to -1, beta hardcoded to 1 KokkosBlas::SerialGemv< KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>:: invoke(-1.0, A, B, 1.0, C); @@ -546,21 +552,24 @@ struct SptrsvWrap { : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_), node_count(node_count_) {} + template struct ReduceFunctor { const Base* m_obj; + size_type rowid; using accum_t = std::conditional_t; KOKKOS_INLINE_FUNCTION void operator()(size_type i, accum_t& accum) const { - auto colid = m_obj->entries(i); - auto val = m_obj->vget(i); - auto lhs_colid = m_obj->lget(colid); - // accum -= val * lhs_colid; - Base::multiply_subtract(val, lhs_colid, accum); - std::cout << " For i=" << i << ", accum: "; Base::print(accum, 1); + const size_type colid = m_obj->entries(i); + if (!AvoidDiag || colid != rowid) { + auto val = m_obj->vget(i); + auto lhs_colid = m_obj->lget(colid); + // accum -= val * lhs_colid; + Base::multiply_subtract(val, lhs_colid, accum); + } } }; @@ -575,11 +584,9 @@ struct SptrsvWrap { const auto eoffset = Base::row_map(rowid + 1); const auto rhs_rowid = Base::rget(rowid); - std::cout << "operator() for rowid: " << rowid << std::endl; - typename Base::reftype lhs_rowid = Base::lget(rowid); reduce_item reduce = lhs_rowid; - ReduceFunctor rf {this}; + ReduceFunctor rf {this, rowid}; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset + (IsLower ? 0 : 1), eoffset - (IsLower ? 1 : 0)), rf, reducer(reduce)); @@ -589,57 +596,56 @@ struct SptrsvWrap { // Team-shared buffer. Use for team work. const auto bs = Base::get_block_size(); - std::cout << "Reduction: "; Base::print(reduce, bs); typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); // At end, finalize rowid == colid Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) - std::cout << "lhs: "; Base::print(lhs_rowid); auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); // lhs_rowid /= diag - std::cout << "diag: "; Base::print(diag); Base::divide(team, lhs_rowid, diag, shared_buff.data()); - std::cout << "lhs: "; Base::print(lhs_rowid); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { - /* - auto my_league = team.league_rank(); // map to rowid - auto rowid = Base::nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - auto soffset = Base::row_map(rowid); - auto eoffset = Base::row_map(rowid + 1); - auto rhs_rowid = Base::rget(rowid); + using reduce_item = typename Base::ArrayType; + using reducer = typename Base::SumArray; + + const auto my_league = team.league_rank(); // map to rowid + const auto rowid = Base::nodes_grouped_by_level(my_league + node_count); + const auto soffset = Base::row_map(rowid); + const auto eoffset = Base::row_map(rowid + 1); + const auto rhs_rowid = Base::rget(rowid); typename Base::reftype lhs_rowid = Base::lget(rowid); + reduce_item reduce = lhs_rowid; + ReduceFunctor rf {this, rowid}; - // Team-shared buffer. Use for team work. 
- const auto bs = Base::get_block_size(); - typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset, eoffset), + rf, reducer(reduce)); + team.team_barrier(); - auto diag = -1; + Base::copy(team, lhs_rowid, reduce); + // Find diag ptr + size_type diag = 0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, typename Base::reftype tdiff) { - auto colid = Base::entries(ptr); - if (colid != rowid) { - auto val = Base::vget(ptr); - auto lhs_colid = Base::lget(colid); - // tdiff -= val * lhs_colid; - Base::multiply_subtract(val, lhs_colid, tdiff); - } else { - diag = ptr; + [&](const size_type ptr, size_type& diag_inner) { + const size_type colid = Base::entries(ptr); + if (colid == rowid) { + diag_inner = ptr; } }, - lhs_rowid); + Kokkos::Max(diag)); team.team_barrier(); + // Team-shared buffer. Use for team work. + const auto bs = Base::get_block_size(); + typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); + // At end, finalize rowid == colid Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) Base::divide(team, lhs_rowid, Base::vget(diag), shared_buff.data()); - */ } }; @@ -838,22 +844,18 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - /* // Thread-local buffers. Use for Serial (non-team) work scalar_t buff1[Base::BUFF_SIZE]; - std::cout << "JGF TriLvlSchedRPSolverFunctor i=" << i << ", block_size=" << Base::get_block_size() << " " << BlockEnabled << std::endl; - - auto rowid = Base::nodes_grouped_by_level(i); - long soffset = Base::row_map(rowid); - long eoffset = Base::row_map(rowid + 1); - auto rhs_rowid = Base::rhs(rowid); + const auto rowid = Base::nodes_grouped_by_level(i); + const auto soffset = Base::row_map(rowid); + const auto eoffset = Base::row_map(rowid + 1); + const auto rhs_rowid = Base::rget(rowid); typename Base::reftype lhs_rowid = Base::lget(rowid); - for (long ptr = soffset + (IsLower ? 0 : 1); ptr < eoffset - (IsLower ? 1 : 0); ++ptr) { - auto colid = Base::entries(ptr); - KK_KERNEL_ASSERT(colid != rowid); + for (auto ptr = soffset + (IsLower ? 0 : 1); ptr < eoffset - (IsLower ? 1 : 0); ++ptr) { + const auto colid = Base::entries(ptr); auto val = Base::vget(ptr); auto lhs_colid = Base::lget(colid); // lhs_rowid -= val * lhs_colid @@ -863,12 +865,10 @@ struct SptrsvWrap { Base::add(rhs_rowid, lhs_rowid); auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); Base::divide(lhs_rowid, diag, &buff1[0]); - */ } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { - /* // Thread-local buffers. 
Use for Serial (non-team) work scalar_t buff1[Base::BUFF_SIZE]; @@ -879,9 +879,9 @@ struct SptrsvWrap { typename Base::reftype lhs_rowid = Base::lget(rowid); - auto diag = -1; - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = Base::entries(ptr); + size_type diag = 0; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + const size_type colid = Base::entries(ptr); if (colid != rowid) { auto val = Base::values(ptr); auto lhs_colid = Base::lget(colid); @@ -892,7 +892,6 @@ struct SptrsvWrap { } // end for ptr Base::add(rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) Base::divide(lhs_rowid, Base::vget(diag), &buff1[0]); - */ } }; From 7345bd3239eb14d8693448c7e7cc6a8f8f1f1e91 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 Jul 2024 18:29:55 -0600 Subject: [PATCH 22/41] Remove test mangling --- sparse/unit_test/Test_Sparse_sptrsv.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 303e4c13f5..f5c881dbc7 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -688,15 +688,10 @@ struct SptrsvTest { const size_type nrows = row_map.size() - 1; // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, currently unavailable - for (auto alg : {SPTRSVAlgorithm::SEQLVLSCHD_TP1, SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN}) { + for (auto alg : {SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1}) { KernelHandle kh; kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); - if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - } - sptrsv_symbolic(&kh, row_map, entries); Kokkos::fence(); From 7ca671a01c894994ed7c7b33063f9047fd3a7132 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 Jul 2024 18:50:04 -0600 Subject: [PATCH 23/41] Switch over block spiluk precond test to use new block sptrsv --- sparse/src/KokkosSparse_LUPrec.hpp | 79 +++++-------------------- sparse/unit_test/Test_Sparse_spiluk.hpp | 2 +- 2 files changed, 16 insertions(+), 65 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index d687c8dd4f..089213a823 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -44,6 +44,7 @@ template class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; + using size_type = typename CRS::size_type; using EXSP = typename CRS::execution_space; using MEMSP = typename CRS::memory_space; using DEVICE = typename Kokkos::Device; @@ -60,7 +61,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: //! Constructor: template - LUPrec(const CRSArg &L, const CRSArg &U) + LUPrec(const CRSArg &L, const CRSArg &U, const size_type block_size=0) : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numPointRows()), @@ -71,9 +72,9 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { "LUPrec: L.numRows() != U.numRows()"); _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), - true); + true, block_size); _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), - false); + false, block_size); } //! Destructor. 
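// --------------------------------------------------------------------------
// Usage sketch for the constructor change above (illustrative only): the new
// block_size argument is forwarded to both sptrsv handles, so a single LUPrec
// instance covers the point path (block_size == 0, the default) and the BSR
// block path. The template arguments, matrix objects, and block_size value
// below are assumptions for illustration, not taken from this patch.
//
//   KokkosSparse::Experimental::LUPrec<MatrixType, KernelHandleType>
//       prec(L, U, block_size);          // block_size == 0 -> point sptrsv
//   // apply(): _tmp = L^-1 * X, then _tmp2 = U^-1 * _tmp via sptrsv_solve,
//   // then Y = alpha * _tmp2 + beta * Y (KokkosBlas::axpby).
//   prec.apply(X, Y);
// --------------------------------------------------------------------------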
@@ -82,66 +83,6 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { _khU.destroy_sptrsv_handle(); } - template < - typename Matrix, - typename std::enable_if::value>::type * = nullptr> - void apply_impl(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { - // tmp = trsv(L, x); //Apply L^inv to x - // y = trsv(U, tmp); //Apply U^inv to tmp - - KK_REQUIRE_MSG(transM[0] == NoTranspose[0], - "LUPrec::apply only supports 'N' for transM"); - - sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); - sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); - - sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); - sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, - _tmp2); - - KokkosBlas::axpby(alpha, _tmp2, beta, Y); - } - - template < - typename Matrix, - typename std::enable_if::value>::type * = nullptr> - void apply_impl(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { - // tmp = trsv(L, x); //Apply L^inv to x - // y = trsv(U, tmp); //Apply U^inv to tmp - - KK_REQUIRE_MSG(transM[0] == NoTranspose[0], - "LUPrec::apply only supports 'N' for transM"); - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - using Layout = Kokkos::LayoutLeft; -#else - using Layout = Kokkos::LayoutRight; -#endif - - // trsv is implemented for MV so we need to convert our views - using UView2d = typename Kokkos::View< - ScalarType **, Layout, DEVICE, - Kokkos::MemoryTraits >; - using UView2dc = typename Kokkos::View< - const ScalarType **, Layout, DEVICE, - Kokkos::MemoryTraits >; - UView2dc X2d(X.data(), X.extent(0), 1); - UView2d Y2d(Y.data(), Y.extent(0), 1), - tmp2d(_tmp.data(), _tmp.extent(0), 1), - tmp22d(_tmp2.data(), _tmp2.extent(0), 1); - - KokkosSparse::trsv("L", "N", "N", _L, X2d, tmp2d); - KokkosSparse::trsv("U", "N", "N", _U, tmp2d, tmp22d); - - KokkosBlas::axpby(alpha, _tmp2, beta, Y); - } - ///// \brief Apply the preconditioner to X, putting the result in Y. ///// ///// \tparam XViewType Input vector, as a 1-D Kokkos::View @@ -158,7 +99,17 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - apply_impl(X, Y, transM, alpha, beta); + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); + + sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); + sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); + + sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, + _tmp2); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); } //@} diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 9eaf087c9b..3aa4abbf31 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -819,7 +819,7 @@ struct SpilukTest { // Make precond. KokkosSparse::Experimental::LUPrec - myPrec(L, U); + myPrec(L, U, UseBlocks ? 
block_size : 0); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); From f993bcb3e23ca035209326049c48f57b37a30665 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 8 Jul 2024 15:29:41 -0600 Subject: [PATCH 24/41] More test cleanup --- sparse/unit_test/Test_Sparse_sptrsv.hpp | 571 ++++++------------------ 1 file changed, 141 insertions(+), 430 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index f5c881dbc7..735bd59d6e 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -50,9 +50,6 @@ struct SptrsvTest { using RowMapType = Kokkos::View; using EntriesType = Kokkos::View; using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< @@ -65,6 +62,9 @@ struct SptrsvTest { using range_policy_t = Kokkos::RangePolicy; + static inline constexpr scalar_t ZERO = scalar_t(0); + static inline constexpr scalar_t ONE = scalar_t(1); + static std::vector> get_5x5_ut_ones_fixture() { std::vector> A = {{1.00, 0.00, 1.00, 0.00, 0.00}, {0.00, 1.00, 0.00, 0.00, 1.00}, @@ -123,6 +123,82 @@ struct SptrsvTest { return A; } + static std::tuple + create_crs_lhs_rhs(const std::vector>& fixture) + { + RowMapType row_map; + EntriesType entries; + ValuesType values; + + compress_matrix(row_map, entries, values, fixture); + const auto nrows = row_map.size() - 1; + const auto nnz = values.size(); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + + return std::make_tuple(triMtx, lhs, rhs); + } + + template + static void basic_check(const SpMatrix& triMtx, const ValuesType& lhs, const ValuesType& rhs, const bool is_lower, const size_type block_size=0) + { + // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, currently unavailable + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1}; + if (block_size == 0) { + // SEQLVLSCHD_TP1CHAIN and SPTRSV_CUSPARSE are not supported for blocks + algs.push_back(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN); +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); + } +#endif + } + + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; + + const size_type nrows = row_map.size() - 1; + + for (auto alg : algs) { + KernelHandle kh; + kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); + if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + } + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + 
scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + + kh.destroy_sptrsv_handle(); + } + } struct ReductionCheck { ValuesType lhs; @@ -134,9 +210,6 @@ struct SptrsvTest { }; static void run_test_sptrsv() { - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - const size_type nrows = 5; const size_type nnz = 10; @@ -163,121 +236,12 @@ struct SptrsvTest { // Upper tri { - RowMapType row_map; - EntriesType entries; - ValuesType values; - - auto fixture = get_5x5_ut_ones_fixture(); - - compress_matrix(row_map, entries, values, fixture); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - - { - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0) ); - */ - - kh.destroy_sptrsv_handle(); - } - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(get_5x5_ut_ones_fixture()); - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); + basic_check(triMtx, lhs, rhs, false); } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t 
sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } -#endif - #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) const scalar_t FIVE = scalar_t(5); const size_type nnz_sp = 14; @@ -388,119 +352,11 @@ struct SptrsvTest { // Lower tri { - auto fixture = get_5x5_lt_ones_fixture(); - RowMapType row_map; - EntriesType entries; - ValuesType values; - - compress_matrix(row_map, entries, values, fixture); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - - { - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ( sum, lhs.extent(0) ); - */ - - kh.destroy_sptrsv_handle(); - } - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); + const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(get_5x5_lt_ones_fixture()); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); + basic_check(triMtx, lhs, 
rhs, true); } -#endif #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) { @@ -579,7 +435,6 @@ struct SptrsvTest { scalar_t sum = 0.0; Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), ReductionCheck(X), sum); - EXPECT_EQ(sum, lhs.extent(0)); EXPECT_EQ(sum, X.extent(0)); khL.destroy_sptrsv_handle(); @@ -640,7 +495,6 @@ struct SptrsvTest { scalar_t sum = 0.0; Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), ReductionCheck(X), sum); - EXPECT_EQ(sum, lhs.extent(0)); EXPECT_EQ(sum, X.extent(0)); khLd.destroy_sptrsv_handle(); @@ -651,62 +505,12 @@ struct SptrsvTest { } static void run_test_sptrsv_blocks_impl(const bool is_lower, const size_type block_size) { - constexpr scalar_t ZERO = scalar_t(0); - constexpr scalar_t ONE = scalar_t(1); - - RowMapType point_row_map; - EntriesType point_entries; - ValuesType point_values; auto fixture = is_lower ? get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); + const auto [triMtx_crs, lhs, rhs] = create_crs_lhs_rhs(fixture); - compress_matrix(point_row_map, point_entries, point_values, fixture); - - const size_type point_nrows = point_row_map.size() - 1; - const size_type point_nnz = point_values.size(); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", point_nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", point_nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", point_nrows); - - Crs triMtx_crs("triMtx", point_nrows, point_nrows, point_nnz, point_values, point_row_map, point_entries); Bsr triMtx(triMtx_crs, block_size); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - - auto row_map = triMtx.graph.row_map; - auto entries = triMtx.graph.entries; - auto values = triMtx.values; - - const size_type nrows = row_map.size() - 1; - - // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, currently unavailable - for (auto alg : {SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1}) { - KernelHandle kh; - kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - Kokkos::deep_copy(lhs, ZERO); - - kh.destroy_sptrsv_handle(); - } + basic_check(triMtx, lhs, rhs, is_lower, block_size); } static void run_test_sptrsv_blocks() { @@ -716,7 +520,7 @@ struct SptrsvTest { } } - static void run_test_sptrsv_streams(int test_algo, int nstreams) { + static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, const bool is_lower) { // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition bool run_streams_test = true; @@ -732,9 +536,6 @@ struct SptrsvTest { #endif if (!run_streams_test) return; - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - const size_type nrows = 5; const size_type nnz = 10; @@ -749,150 +550,65 @@ struct SptrsvTest { std::vector rhs_v(nstreams); std::vector lhs_v(nstreams); - RowMapType_hostmirror hrow_map; - EntriesType_hostmirror hentries; - ValuesType_hostmirror hvalues; - - // Upper tri - { - auto fixture = get_5x5_ut_ones_fixture(); - compress_matrix(hrow_map, hentries, hvalues, fixture); - - for (int i = 0; i < nstreams; i++) { - // 
Allocate U - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); - - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = false; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); + auto fixture = is_lower ? get_5x5_lt_ones_fixture() : get_5x5_ut_ones_fixture(); + const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(fixture); - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - EXPECT_EQ(sum, lhs_v[i].extent(0)); - - kh_v[i].destroy_sptrsv_handle(); - } - } - - // Lower tri - { - auto fixture = get_5x5_lt_ones_fixture(); - compress_matrix(hrow_map, hentries, hvalues, fixture); + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; - for (int i = 0; i < nstreams; i++) { - // Allocate L - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); + for (int i = 0; i < nstreams; i++) { + // Allocate + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); + // Copy + Kokkos::deep_copy(row_map_v[i], row_map); + Kokkos::deep_copy(entries_v[i], entries); + Kokkos::deep_copy(values_v[i], values); - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); - // A*known_lhs generates rhs: rhs is 
dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); - Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); + Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = true; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams + // Create handle + kh_v[i] = KernelHandle(); + kh_v[i].create_sptrsv_handle(test_algo, nrows, is_lower); + kh_ptr_v[i] = &kh_v[i]; - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams - for (int i = 0; i < nstreams; i++) instances[i].fence(); + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - EXPECT_EQ(sum, lhs_v[i].extent(0)); + for (int i = 0; i < nstreams; i++) instances[i].fence(); - kh_v[i].destroy_sptrsv_handle(); - } + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + EXPECT_EQ(sum, lhs_v[i].extent(0)); + kh_v[i].destroy_sptrsv_handle(); } } }; @@ -911,25 +627,20 @@ template void test_sptrsv_streams() { using TestStruct = Test::SptrsvTest; - - TestStruct::run_test_sptrsv_streams(0, 1); - TestStruct::run_test_sptrsv_streams(0, 2); - TestStruct::run_test_sptrsv_streams(0, 3); - TestStruct::run_test_sptrsv_streams(0, 4); - TestStruct::run_test_sptrsv_streams(1, 1); - TestStruct::run_test_sptrsv_streams(1, 2); - TestStruct::run_test_sptrsv_streams(1, 3); - TestStruct::run_test_sptrsv_streams(1, 4); - + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1}; #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) if (std::is_same::value && std::is_same::value) { - TestStruct::run_test_sptrsv_streams(2, 1); - TestStruct::run_test_sptrsv_streams(2, 2); - TestStruct::run_test_sptrsv_streams(2, 3); - TestStruct::run_test_sptrsv_streams(2, 4); + algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); } #endif + + for (auto alg : algs) { + for (int nstreams = 1; nstreams <= 4; ++nstreams) { + TestStruct::run_test_sptrsv_streams(alg, nstreams, true); + TestStruct::run_test_sptrsv_streams(alg, nstreams, false); + } + } } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 616fa9ca1f93d6de73035baf5068f267170fccd0 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 9 Jul 
2024 15:38:51 -0600 Subject: [PATCH 25/41] Fixes for GPU warnings --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 63 +++++++++---------- sparse/unit_test/Test_Sparse_sptrsv.hpp | 8 +-- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 019a63fcd7..0a4a75933e 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -305,7 +305,6 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) typedef typename KernelHandle::nnz_lno_t idx_type; - typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::scalar_t scalar_type; typedef typename KernelHandle::memory_space memory_space; @@ -474,7 +473,6 @@ void sptrsvcuSPARSE_solve_streams( ) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE using idx_type = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; using scalar_type = typename KernelHandle::nnz_scalar_t; using memory_space = typename KernelHandle::HandlePersistentMemorySpace; using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; @@ -544,6 +542,8 @@ void sptrsvcuSPARSE_solve_streams( } } #else // CUDA_VERSION < 11030 + using size_type = typename KernelHandle::size_type; + const bool is_cuda_space = std::is_same::value || std::is_same::value || diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 6f2303e360..53a1c853e7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -167,7 +167,6 @@ struct SptrsvWrap { // add. 
y += x KOKKOS_INLINE_FUNCTION static void add(const member_type &team, const scalar_t& x, scalar_t& y) { - scalar_t orig = y; Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); team.team_barrier(); } @@ -182,7 +181,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void divide(const member_type &team, scalar_t &b, const scalar_t &A, scalar_t*) { - scalar_t orig = b; Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); team.team_barrier(); } @@ -197,7 +195,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) { - scalar_t orig = C; C -= A * B; } @@ -266,8 +263,8 @@ struct SptrsvWrap { ArrayType() { init(); } KOKKOS_INLINE_FUNCTION - ArrayType(const ArrayType& rhs) { - for (int i = 0; i < BUFF_SIZE; ++i) m_data[i] = rhs.m_data[i]; + ArrayType(const ArrayType& rhs_) { + for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = rhs_.m_data[i]; } KOKKOS_INLINE_FUNCTION @@ -275,18 +272,18 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void init() { - for (int i = 0; i < BUFF_SIZE; ++i) m_data[i] = 0; + for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = 0; } KOKKOS_INLINE_FUNCTION - ArrayType& operator +=(const ArrayType& rhs) { - for (int i = 0; i < BUFF_SIZE; ++i) m_data[i] += rhs.m_data[i]; + ArrayType& operator +=(const ArrayType& rhs_) { + for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] += rhs_.m_data[i]; return *this; } KOKKOS_INLINE_FUNCTION - ArrayType& operator +=(const values_t& rhs) { - for (int i = 0; i < rhs.size(); ++i) m_data[i] += rhs(i); + ArrayType& operator +=(const values_t& rhs_) { + for (int i = 0; i < rhs_.size(); ++i) m_data[i] += rhs_(i); return *this; } }; @@ -362,27 +359,27 @@ struct SptrsvWrap { } KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const CVector &rhs) const { + void lset(const size_type row, const CVector &rhs_) const { auto lvec = lget(row); - assign(lvec, rhs); + assign(lvec, rhs_); } // assign template - KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs, - const View2 &rhs) { - for (size_type i = 0; i < lhs.size(); ++i) { - lhs.data()[i] = rhs.data()[i]; + KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, + const View2 &rhs_) { + for (size_t i = 0; i < lhs_.size(); ++i) { + lhs_.data()[i] = rhs_.data()[i]; } } template KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, - const View1 &lhs, - const View2 &rhs) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs.size()), + const View1 &lhs_, + const View2 &rhs_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs_.size()), [&](const size_type i) { - lhs.data()[i] = rhs.data()[i]; + lhs_.data()[i] = rhs_.data()[i]; }); } @@ -398,7 +395,7 @@ struct SptrsvWrap { KokkosBlas::serial_axpy(1.0, x, y); } - // divide. b /= A (b = b * rhs^-1) + // divide. b /= A (b = b * A^-1) KOKKOS_INLINE_FUNCTION static void divide(const member_type &team, const Vector &b, const CBlock &A, scalar_t* buff) { @@ -423,7 +420,7 @@ struct SptrsvWrap { KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); } - // serial divide. b /= A (b = b * rhs^-1) + // serial divide. 
b /= A (b = b * A^-1) KOKKOS_INLINE_FUNCTION static void divide(const Vector &b, const CBlock &A, scalar_t* buff) { // Need a temp block to do LU of A @@ -464,9 +461,9 @@ struct SptrsvWrap { } KOKKOS_INLINE_FUNCTION - static void copy(const member_type &team, const Vector& lhs, ArrayType& rhsa) { - CVector rhs(&rhsa.m_data[0], lhs.size()); - assign(team, lhs, rhs); + static void copy(const member_type &team, const Vector& lhs_, ArrayType& rhsa) { + CVector rhs_(&rhsa.m_data[0], lhs_.size()); + assign(team, lhs_, rhs_); } // lget @@ -511,21 +508,21 @@ struct SptrsvWrap { } KOKKOS_INLINE_FUNCTION - static void print(const ArrayType& rhs, const int block_size) + static void print(const ArrayType& rhs_, const int block_size) { std::cout << "Array: "; for (int i = 0; i < block_size; ++i) { - std::cout << rhs.m_data[i] << " "; + std::cout << rhs_.m_data[i] << " "; } std::cout << std::endl; } KOKKOS_INLINE_FUNCTION - static void print(const SumArray& rhs, const int block_size) + static void print(const SumArray& rhs_, const int block_size) { std::cout << "SumArray: "; for (int i = 0; i < block_size; ++i) { - std::cout << rhs.reference().m_data[i] << " "; + std::cout << rhs_.reference().m_data[i] << " "; } std::cout << std::endl; } @@ -556,14 +553,14 @@ struct SptrsvWrap { struct ReduceFunctor { const Base* m_obj; - size_type rowid; + lno_t rowid; using accum_t = std::conditional_t; KOKKOS_INLINE_FUNCTION void operator()(size_type i, accum_t& accum) const { - const size_type colid = m_obj->entries(i); + const auto colid = m_obj->entries(i); if (!AvoidDiag || colid != rowid) { auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); @@ -631,7 +628,7 @@ struct SptrsvWrap { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, soffset, eoffset), [&](const size_type ptr, size_type& diag_inner) { - const size_type colid = Base::entries(ptr); + const auto colid = Base::entries(ptr); if (colid == rowid) { diag_inner = ptr; } @@ -881,7 +878,7 @@ struct SptrsvWrap { size_type diag = 0; for (auto ptr = soffset; ptr < eoffset; ++ptr) { - const size_type colid = Base::entries(ptr); + const auto colid = Base::entries(ptr); if (colid != rowid) { auto val = Base::values(ptr); auto lhs_colid = Base::lget(colid); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 735bd59d6e..35e7d57572 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -62,8 +62,8 @@ struct SptrsvTest { using range_policy_t = Kokkos::RangePolicy; - static inline constexpr scalar_t ZERO = scalar_t(0); - static inline constexpr scalar_t ONE = scalar_t(1); + static inline const scalar_t ZERO = scalar_t(0); + static inline const scalar_t ONE = scalar_t(1); static std::vector> get_5x5_ut_ones_fixture() { std::vector> A = {{1.00, 0.00, 1.00, 0.00, 0.00}, @@ -211,7 +211,6 @@ struct SptrsvTest { static void run_test_sptrsv() { const size_type nrows = 5; - const size_type nnz = 10; #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using host_crsmat_t = @@ -580,9 +579,6 @@ struct SptrsvTest { // A*known_lhs generates rhs: rhs is dense, use spmv rhs_v[i] = ValuesType("rhs", nrows); - Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); Kokkos::fence(); From 55f8a1a43f9f9dd9b10d625682ff8e62625dbbd1 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 17 Jul 2024 11:33:37 -0600 Subject: [PATCH 26/41] Conflicts resolved, still work needed --- 
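Reviewer note (illustrative, not part of the patch): the block-enabled path introduced below reduces each row update to x_i -= A_ij * x_j over the off-diagonal blocks, followed by a dense solve against the diagonal block D_ii. The sketch that follows is a plain-C++ illustration of that arithmetic on BSR-style storage (row-major bs x bs blocks, sorted lower-triangular rows with the diagonal block stored last, no pivoting). The helper name lower_bsr_solve is hypothetical; the actual implementation in this patch uses KokkosBatched LU and Trsv calls plus a custom SumArray team reducer instead of the hand-rolled loops shown here.

#include <cstddef>
#include <vector>

// Illustrative sketch only (hypothetical helper, not the KokkosKernels API):
// forward solve of a sorted block-lower-triangular BSR matrix.
//   bs      - block size
//   row_map - BSR row offsets, size nrows+1
//   entries - BSR column indices, sorted; diagonal block is last in each row
//   values  - bs*bs scalars per block, row-major
//   x       - holds the rhs on entry and the solution on exit (nrows*bs)
void lower_bsr_solve(std::size_t bs,
                     const std::vector<std::size_t> &row_map,
                     const std::vector<std::size_t> &entries,
                     const std::vector<double> &values,
                     std::vector<double> &x) {
  const std::size_t nrows = row_map.size() - 1;
  for (std::size_t i = 0; i < nrows; ++i) {
    double *xi = x.data() + i * bs;
    // x_i -= A_ij * x_j over the off-diagonal blocks (all but the last entry)
    for (std::size_t p = row_map[i]; p + 1 < row_map[i + 1]; ++p) {
      const double *Aij = values.data() + p * bs * bs;
      const double *xj  = x.data() + entries[p] * bs;
      for (std::size_t r = 0; r < bs; ++r)
        for (std::size_t c = 0; c < bs; ++c) xi[r] -= Aij[r * bs + c] * xj[c];
    }
    // x_i = D_ii^{-1} * x_i: Gaussian elimination (no pivoting) on a copy of
    // the diagonal block; the patch instead does a batched LU + two trsv's.
    const double *diag = values.data() + (row_map[i + 1] - 1) * bs * bs;
    std::vector<double> D(diag, diag + bs * bs);
    for (std::size_t k = 0; k < bs; ++k) {
      for (std::size_t r = k + 1; r < bs; ++r) {
        const double m = D[r * bs + k] / D[k * bs + k];
        for (std::size_t c = k; c < bs; ++c) D[r * bs + c] -= m * D[k * bs + c];
        xi[r] -= m * xi[k];
      }
    }
    for (std::size_t r = bs; r-- > 0;) {
      for (std::size_t c = r + 1; c < bs; ++c) xi[r] -= D[r * bs + c] * xi[c];
      xi[r] /= D[r * bs + r];
    }
  }
}

For bs == 1 this degenerates to the usual scalar x_i = (b_i - sum) / a_ii update, which is what the unblocked Common specialization implements; the sorted upper-triangular case is symmetric, with the diagonal block first in each row and the row loop running backwards.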
.../impl/KokkosSparse_sptrsv_solve_impl.hpp | 539 ++++++++++++++---- 1 file changed, 431 insertions(+), 108 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index bc31f14791..66b2632b38 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -28,12 +28,18 @@ #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV // Enable supernodal sptrsv -#include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBatched_Util.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" -#include "KokkosBatched_Trsm_Team_Impl.hpp" #endif +#include "KokkosBlas3_trsm.hpp" +#include "KokkosBatched_Trsv_Decl.hpp" +#include "KokkosBatched_Trsm_Team_Impl.hpp" +#include "KokkosBlas1_team_axpby.hpp" +#include "KokkosBlas1_axpby.hpp" +#include "KokkosBlas1_set.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBlas2_serial_gemv_impl.hpp" #define KOKKOSKERNELS_SPTRSV_TRILVLSCHED @@ -99,7 +105,9 @@ struct SptrsvWrap { }; /** - * Common base class for sptrsv functors + * Common base class for sptrsv functors that need to work for both + * point and block matrices. Default version does not support + * blocks */ template @@ -111,6 +119,22 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; + using reftype = scalar_t &; + using ArrayType = reftype; + using SumArray = reftype; + + struct SBlock { + template + KOKKOS_INLINE_FUNCTION SBlock(T, size_type, size_type) {} + + KOKKOS_INLINE_FUNCTION + scalar_t *data() { return nullptr; } + + static int shmem_size(size_type, size_type) { return 0; } + }; + + static constexpr size_type BUFF_SIZE = 1; + Common(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, @@ -121,10 +145,82 @@ struct SptrsvWrap { lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_) { - KK_REQUIRE_MSG(!BlockEnabled, "Blocks are not yet supported."); - KK_REQUIRE_MSG(block_size_ == 0, "Blocks are not yet supported."); + KK_REQUIRE_MSG(block_size_ == 0, + "Tried to use blocks with the unblocked Common?"); + } + + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return 0; } + + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type row, const scalar_t value) const { + lhs(row) = value; + } + + // add. y += x + KOKKOS_INLINE_FUNCTION + static void add(const member_type &team, const scalar_t& x, scalar_t& y) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); + team.team_barrier(); + } + + // serial add. y += x + KOKKOS_INLINE_FUNCTION + static void add(const scalar_t& x, scalar_t& y) { + y += x; + } + + // divide. b /= A + KOKKOS_INLINE_FUNCTION + static void divide(const member_type &team, scalar_t &b, const scalar_t &A, + scalar_t*) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); + team.team_barrier(); } + // serial divide. b /= A + KOKKOS_INLINE_FUNCTION + static void divide(scalar_t &b, const scalar_t &A, scalar_t*) { + b /= A; + } + + // multiply_subtract. 
C -= A * B + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const scalar_t &A, const scalar_t &B, + scalar_t &C) { + C -= A * B; + } + + KOKKOS_INLINE_FUNCTION + static void copy(const member_type&, scalar_t&, const scalar_t&) {} + + // lget + KOKKOS_INLINE_FUNCTION + scalar_t& lget(const size_type row) const { return lhs(row); } + + // rget + KOKKOS_INLINE_FUNCTION + scalar_t rget(const size_type row) const { return rhs(row); } + + // vget + KOKKOS_INLINE_FUNCTION + scalar_t vget(const size_type nnz) const { return values(nnz); } + + // lhs = (lhs + rhs) / diag + KOKKOS_INLINE_FUNCTION + static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, + const scalar_t &diag_val) { + lhs_val = (lhs_val + rhs_val) / diag_val; + } + + // print + KOKKOS_INLINE_FUNCTION + static void print(const scalar_t &item) { std::cout << item << std::endl; } + + KOKKOS_INLINE_FUNCTION + static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } + struct ReduceSumFunctor { const Common *m_obj; const lno_t rowid; @@ -158,12 +254,6 @@ struct SptrsvWrap { } }; - KOKKOS_INLINE_FUNCTION - static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, - const scalar_t &diag_val) { - lhs_val = (lhs_val + rhs_val) / diag_val; - } - template KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, @@ -243,6 +333,317 @@ struct SptrsvWrap { } }; + // Partial specialization for block support + template + struct Common { + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; + + using Block = Kokkos::View< + scalar_t **, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits >; + + // const block + using CBlock = Kokkos::View< + const scalar_t **, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits >; + + // scratch block + using SBlock = Kokkos::View< + scalar_t **, Layout, typename execution_space::scratch_memory_space, + Kokkos::MemoryTraits >; + + using Vector = Kokkos::View< + scalar_t *, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits >; + + using CVector = Kokkos::View< + const scalar_t *, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits >; + + static constexpr size_type BUFF_SIZE = 128; + + using reftype = Vector; + + struct ArrayType + { + scalar_t m_data[BUFF_SIZE]; + + KOKKOS_INLINE_FUNCTION + ArrayType() { init(); } + + KOKKOS_INLINE_FUNCTION + ArrayType(const ArrayType& rhs_) { + for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = rhs_.m_data[i]; + } + + KOKKOS_INLINE_FUNCTION + ArrayType(const Vector&) { init(); } + + KOKKOS_INLINE_FUNCTION + void init() { + for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + ArrayType& operator +=(const ArrayType& rhs_) { + for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] += rhs_.m_data[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION + ArrayType& operator +=(const values_t& rhs_) { + for (int i = 0; i < rhs_.size(); ++i) m_data[i] += rhs_(i); + return *this; + } + }; + + struct SumArray + { + using reducer = SumArray; + using value_type = ArrayType; + using result_view_type = Kokkos::View; + + private: + value_type& m_value; + + public: + KOKKOS_INLINE_FUNCTION + SumArray(value_type& value) : m_value(value) {} + + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest += src; + } + + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { val.init(); } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return m_value; } + + KOKKOS_INLINE_FUNCTION + 
result_view_type view() const { return result_view_type(&m_value, 1); } + + KOKKOS_INLINE_FUNCTION + bool reference_scalar() const { return true; } + }; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + size_type block_size; + size_type block_items; + + Common(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, + LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + block_size(block_size_), + block_items(block_size * block_size) + { + KK_REQUIRE_MSG(block_size > 0, + "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); + } + + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type row, const scalar_t &value) const { + KokkosBlas::SerialSet::invoke(value, lget(row)); + } + + KOKKOS_INLINE_FUNCTION + void lset(const size_type row, const CVector &rhs_) const { + auto lvec = lget(row); + assign(lvec, rhs_); + } + + // assign + template + KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, + const View2 &rhs_) { + for (size_t i = 0; i < lhs_.size(); ++i) { + lhs_.data()[i] = rhs_.data()[i]; + } + } + + template + KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, + const View1 &lhs_, + const View2 &rhs_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs_.size()), + [&](const size_type i) { + lhs_.data()[i] = rhs_.data()[i]; + }); + } + + // add. y += x + KOKKOS_INLINE_FUNCTION + static void add(const member_type &team, const CVector& x, const Vector& y) { + KokkosBlas::Experimental::axpy(team, 1.0, x, y); + } + + // serial add. y += x + KOKKOS_INLINE_FUNCTION + static void add(const CVector& x, const Vector& y) { + KokkosBlas::serial_axpy(1.0, x, y); + } + + // divide. b /= A (b = b * A^-1) + KOKKOS_INLINE_FUNCTION + static void divide(const member_type &team, const Vector &b, const CBlock &A, + scalar_t* buff) { + // Need a temp block to do LU of A + const auto block_size = b.size(); + Block LU(buff, block_size, block_size); + assign(team, LU, A); + KokkosBatched::TeamLU::invoke(team, LU); + + // A = LU + // A^-1 = U^-1 * L^-1 + // b = (b * U^-1) * L^-1, so do U trsv first + KokkosBatched::TeamTrsv< + member_type, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); + + KokkosBatched::TeamTrsv< + member_type, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); + } + + // serial divide. 
b /= A (b = b * A^-1) + KOKKOS_INLINE_FUNCTION + static void divide(const Vector &b, const CBlock &A, scalar_t* buff) { + // Need a temp block to do LU of A + const auto block_size = b.size(); + Block LU(buff, block_size, block_size); + assign(LU, A); + KokkosBatched::SerialLU::invoke(LU); + + // A = LU + // A^-1 = U^-1 * L^-1 + // b = (b * U^-1) * L^-1, so do U trsv first + KokkosBatched::SerialTrsv< + KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); + + KokkosBatched::SerialTrsv< + KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); + } + + // multiply_subtract. C -= A * B + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const CBlock &A, const CVector &B, + ArrayType &Ca) { + Vector C(&Ca.m_data[0], B.size()); + multiply_subtract(A, B, C); + } + + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const CBlock &A, const CVector &B, + Vector &C) { + // Use gemv. alpha is hardcoded to -1, beta hardcoded to 1 + KokkosBlas::SerialGemv< + KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>:: + invoke(-1.0, A, B, 1.0, C); + } + + KOKKOS_INLINE_FUNCTION + static void copy(const member_type &team, const Vector& lhs_, ArrayType& rhsa) { + CVector rhs_(&rhsa.m_data[0], lhs_.size()); + assign(team, lhs_, rhs_); + } + + // lget + KOKKOS_INLINE_FUNCTION + Vector lget(const size_type row) const { + return Vector(lhs.data() + (row * block_size), block_size); + } + + // rget + KOKKOS_INLINE_FUNCTION + CVector rget(const size_type row) const { + return CVector(rhs.data() + (row * block_size), block_size); + } + + // vget + KOKKOS_INLINE_FUNCTION + CBlock vget(const size_type block) const { + return CBlock(values.data() + (block * block_items), block_size, block_size); + } + + // print + KOKKOS_INLINE_FUNCTION + static void print(const CBlock &item) { + std::cout << "Block: "; + for (size_type i = 0; i < item.extent(0); ++i) { + std::cout << " "; + for (size_type j = 0; j < item.extent(1); ++j) { + std::cout << item(i, j) << " "; + } + std::cout << std::endl; + } + } + + // print + KOKKOS_INLINE_FUNCTION + static void print(const CVector &item) { + std::cout << "Vector: "; + for (size_type i = 0; i < item.extent(0); ++i) { + std::cout << item(i) << " "; + } + std::cout << std::endl; + } + + KOKKOS_INLINE_FUNCTION + static void print(const ArrayType& rhs_, const int block_size) + { + std::cout << "Array: "; + for (int i = 0; i < block_size; ++i) { + std::cout << rhs_.m_data[i] << " "; + } + std::cout << std::endl; + } + + KOKKOS_INLINE_FUNCTION + static void print(const SumArray& rhs_, const int block_size) + { + std::cout << "SumArray: "; + for (int i = 0; i < block_size; ++i) { + std::cout << rhs_.reference().m_data[i] << " "; + } + std::cout << std::endl; + } + }; + + // + // TriLvlSched functors + // + template struct TriLvlSchedTP1SolverFunctor @@ -278,8 +679,6 @@ struct SptrsvWrap { } }; - // Lower vs Upper Multi-block Functors - template struct TriLvlSchedRPSolverFunctor @@ -388,6 +787,10 @@ struct SptrsvWrap { } }; + // + // Supernodal functors + // + #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // ----------------------------------------------------------- // Helper functors for Lower-triangular solve with SpMV @@ -1074,7 +1477,7 @@ struct SptrsvWrap { Functor - template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType 
row_map, @@ -1092,14 +1495,14 @@ struct SptrsvWrap { const auto hnodes_per_level = thandle.get_host_nodes_per_level(); const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); - const auto block_enabled = false; // thandle.is_block_enabled(); - assert(block_size == 0); + const auto block_enabled = thandle.is_block_enabled(); + assert(block_enabled == BlockEnabled); // Set up functor types using LowerRPFunc = - FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, BlockEnabled); using LowerTPFunc = - FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1183,53 +1586,14 @@ struct SptrsvWrap { auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = LowerTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), ltpp); } - // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed - /* - else if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef - Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { - team_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : - 64; - } - int vector_size = thandle.get_team_size(); - if ( vector_size == -1 ) { - vector_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 
1 : - 4; - } - - // This impl: "chunk" lvl_nodes into node_groups; a league_rank is - responsible for processing team_size # nodes - // TeamThreadRange over number nodes of node_groups - // To avoid masking threads, 1 thread (team) per node in - node_group (thread has full ownership of a node) - // ThreadVectorRange responsible for the actual solve - computation - //const int node_groups = team_size; - const int node_groups = vector_size; - - #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, true, node_count, vector_size, 0); #else - LowerTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, node_count, node_groups); #endif - Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( - (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size - ), tstf); } // end elseif - */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || @@ -1470,7 +1834,7 @@ struct SptrsvWrap { #endif } // end lower_tri_solve - template static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, @@ -1490,14 +1854,14 @@ struct SptrsvWrap { auto hnodes_per_level = thandle.get_host_nodes_per_level(); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); - const auto block_enabled = false; // thandle.is_block_enabled(); - assert(block_size == 0); + const auto block_enabled = thandle.is_block_enabled(); + assert(block_size == BlockEnabled); // Set up functor types using UpperRPFunc = - FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, false); + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, BlockEnabled); using UpperTPFunc = - FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, false); + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1580,52 +1944,14 @@ struct SptrsvWrap { auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = UpperTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_u_team", Kokkos::Experimental::require( tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), utpp); } - // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed - /* - else if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { -typedef Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { - team_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace ->::value ? 1 : 64; - } - int vector_size = thandle.get_team_size(); - if ( vector_size == -1 ) { - vector_size = std::is_same< typename -Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 
1 : 4; - } - - // This impl: "chunk" lvl_nodes into node_groups; a league_rank is -responsible for processing that many nodes - // TeamThreadRange over number nodes of node_groups - // To avoid masking threads, 1 thread (team) per node in -node_group (thread has full ownership of a node) - // ThreadVectorRange responsible for the actual solve computation - //const int node_groups = team_size; - const int node_groups = vector_size; - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, -nodes_grouped_by_level, false, node_count, vector_size, 0); #else - UpperTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, -nodes_grouped_by_level, node_count, node_groups); #endif - - Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( -(int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size ), -tstf); } // end elseif - */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || @@ -2118,10 +2444,7 @@ tstf); } // end elseif Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? + node_count += lvl_nodes; } // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the From a0d7f5ede9f57638290de5178a68556c8c64d687 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 17 Jul 2024 12:41:05 -0600 Subject: [PATCH 27/41] builds --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 331 +++++++++++------- 1 file changed, 199 insertions(+), 132 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 66b2632b38..5c4643c3f2 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -133,8 +133,6 @@ struct SptrsvWrap { static int shmem_size(size_type, size_type) { return 0; } }; - static constexpr size_type BUFF_SIZE = 1; - Common(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, @@ -195,6 +193,9 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void copy(const member_type&, scalar_t&, const scalar_t&) {} + KOKKOS_INLINE_FUNCTION + static void copy(scalar_t&, const scalar_t&) {} + // lget KOKKOS_INLINE_FUNCTION scalar_t& lget(const size_type row) const { return lhs(row); } @@ -207,130 +208,29 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION scalar_t vget(const size_type nnz) const { return values(nnz); } - // lhs = (lhs + rhs) / diag + // lhs = (lhs + rhs) / diag (team) + KOKKOS_INLINE_FUNCTION + static void add_and_divide(const member_type& team, scalar_t &lhs_val, const scalar_t &rhs_val, + const scalar_t &diag_val) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + lhs_val = (lhs_val + rhs_val) / diag_val; + }); + } + + // lhs = (lhs + rhs) / diag (serial) KOKKOS_INLINE_FUNCTION static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, const scalar_t &diag_val) { lhs_val = (lhs_val + rhs_val) / diag_val; } + // print KOKKOS_INLINE_FUNCTION static void print(const scalar_t &item) { std::cout << item << std::endl; } KOKKOS_INLINE_FUNCTION static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } - - struct 
ReduceSumFunctor { - const Common *m_obj; - const lno_t rowid; - lno_t diag; - - KOKKOS_INLINE_FUNCTION - void operator()(size_type i, scalar_t &accum) const { - const auto colid = m_obj->entries(i); - auto val = m_obj->values(i); - auto lhs_colid = m_obj->lhs(colid); - accum -= val * lhs_colid; - KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); - } - }; - - struct ReduceSumDiagFunctor { - const Common *m_obj; - const lno_t rowid; - lno_t diag; - - KOKKOS_INLINE_FUNCTION - void operator()(size_type i, scalar_t &accum) const { - const auto colid = m_obj->entries(i); - if (colid != rowid) { - auto val = m_obj->values(i); - auto lhs_colid = m_obj->lhs(colid); - accum -= val * lhs_colid; - } else { - diag = i; - } - } - }; - - template - KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, - const int my_rank, - const long node_count) const { - static_assert( - !((!IsSerial && BlockEnabled) && UseThreadVec), - "ThreadVectorRanges are not yet supported for block-enabled"); - static_assert(!(IsSerial && UseThreadVec), - "Requested thread vector range in serial?"); - - const auto rowid = nodes_grouped_by_level(my_rank + node_count); - const auto soffset = row_map(rowid); - const auto eoffset = row_map(rowid + 1); - const auto rhs_val = rhs(rowid); - scalar_t &lhs_val = lhs(rowid); - - // Set up range to auto-skip diag if is sorted - const auto itr_b = soffset + (IsSorted ? (IsLower ? 0 : 1) : 0); - const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); - - // We don't need the reducer to find the diag item if sorted - using reducer_t = - std::conditional_t; - reducer_t rf{this, rowid, -1}; - - if constexpr (IsSerial) { - KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); - KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); - for (auto ptr = itr_b; ptr < itr_e; ++ptr) { - rf(ptr, lhs_val); - } - } else { - KK_KERNEL_ASSERT_MSG(team != nullptr, - "Cannot do team operations without team"); - if constexpr (!UseThreadVec) { - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), - rf, lhs_val); - team->team_barrier(); - } else { - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); - } - } - - // If sorted, we already know the diag. Otherwise, get it from the reducer - rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; - - // At end, handle the diag element. We need to be careful to avoid race - // conditions here. - if constexpr (IsSerial) { - // Serial case is easy, there's only 1 thread so just do the - // add_and_divide - KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); - add_and_divide(lhs_val, rhs_val, values(rf.diag)); - } else { - if constexpr (IsSorted) { - // Parallel sorted case is complex. All threads know what the diag is. - // If we have a team sharing the work, we need to ensure only one - // thread performs the add_and_divide. - KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); - if constexpr (!UseThreadVec) { - Kokkos::single(Kokkos::PerTeam(*team), [&]() { - add_and_divide(lhs_val, rhs_val, values(rf.diag)); - }); - } else { - add_and_divide(lhs_val, rhs_val, values(rf.diag)); - } - } else { - // Parallel unsorted case. Only one thread should know what the diag - // item is. We have that one do the add_and_divide. 
- if (rf.diag != -1) { - add_and_divide(lhs_val, rhs_val, values(rf.diag)); - } - } - } - } }; // Partial specialization for block support @@ -362,20 +262,21 @@ struct SptrsvWrap { const scalar_t *, Layout, typename ValuesType::device_type, Kokkos::MemoryTraits >; + static constexpr size_type MAX_VEC_SIZE = 11; static constexpr size_type BUFF_SIZE = 128; using reftype = Vector; struct ArrayType { - scalar_t m_data[BUFF_SIZE]; + scalar_t m_data[MAX_VEC_SIZE]; KOKKOS_INLINE_FUNCTION ArrayType() { init(); } KOKKOS_INLINE_FUNCTION ArrayType(const ArrayType& rhs_) { - for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = rhs_.m_data[i]; + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = rhs_.m_data[i]; } KOKKOS_INLINE_FUNCTION @@ -383,12 +284,12 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void init() { - for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = 0; + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = 0; } KOKKOS_INLINE_FUNCTION ArrayType& operator +=(const ArrayType& rhs_) { - for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] += rhs_.m_data[i]; + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i]; return *this; } @@ -458,7 +359,7 @@ struct SptrsvWrap { { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); + KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE); } KOKKOS_INLINE_FUNCTION @@ -509,11 +410,13 @@ struct SptrsvWrap { // divide. b /= A (b = b * A^-1) KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, const Vector &b, const CBlock &A, - scalar_t* buff) { - // Need a temp block to do LU of A + static void divide(const member_type &team, const Vector &b, const CBlock &A) { + // Team-shared buffer. Use for team work. const auto block_size = b.size(); - Block LU(buff, block_size, block_size); + SBlock shared_buff(team.team_shmem(), block_size, block_size); + + // Need a temp block to do LU of A + Block LU(shared_buff.data(), block_size, block_size); assign(team, LU, A); KokkosBatched::TeamLU::invoke(team, LU); @@ -534,10 +437,13 @@ struct SptrsvWrap { // serial divide. b /= A (b = b * A^-1) KOKKOS_INLINE_FUNCTION - static void divide(const Vector &b, const CBlock &A, scalar_t* buff) { + static void divide(const Vector &b, const CBlock &A) { + // Thread-local buffers. 
Use for Serial (non-team) work + scalar_t buff[BUFF_SIZE]; + // Need a temp block to do LU of A const auto block_size = b.size(); - Block LU(buff, block_size, block_size); + Block LU(&buff[0], block_size, block_size); assign(LU, A); KokkosBatched::SerialLU::invoke(LU); @@ -578,6 +484,12 @@ struct SptrsvWrap { assign(team, lhs_, rhs_); } + KOKKOS_INLINE_FUNCTION + static void copy(const Vector& lhs_, ArrayType& rhsa) { + CVector rhs_(&rhsa.m_data[0], lhs_.size()); + assign(lhs_, rhs_); + } + // lget KOKKOS_INLINE_FUNCTION Vector lget(const size_type row) const { @@ -596,6 +508,22 @@ struct SptrsvWrap { return CBlock(values.data() + (block * block_items), block_size, block_size); } + // lhs = (lhs + rhs) / diag + KOKKOS_INLINE_FUNCTION + static void add_and_divide(const member_type& team, const Vector &lhs_val, const CVector &rhs_val, + const CBlock &diag_val) { + add(team, rhs_val, lhs_val); + divide(team, lhs_val, diag_val); + } + + KOKKOS_INLINE_FUNCTION + static void add_and_divide(const Vector &lhs_val, const CVector &rhs_val, + const CBlock &diag_val) { + add(rhs_val, lhs_val); + divide(lhs_val, diag_val); + } + + // print KOKKOS_INLINE_FUNCTION static void print(const CBlock &item) { @@ -640,6 +568,145 @@ struct SptrsvWrap { } }; + /** + * Intermediate class that contains implementation that identical + * for blocked / non-blocked + */ + template + struct Intermediate : + public Common { + using Base = Common; + using accum_t = std::conditional_t; + + Intermediate(const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_) {} + + struct ReduceSumFunctor { + const Base *m_obj; + const lno_t rowid; + lno_t diag; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, accum_t &accum) const { + const auto colid = m_obj->entries(i); + auto val = m_obj->vget(i); + auto lhs_colid = m_obj->lget(colid); + //accum -= val * lhs_colid; + Base::multiply_subtract(val, lhs_colid, accum); + KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); + } + }; + + struct ReduceSumDiagFunctor { + const Base *m_obj; + const lno_t rowid; + lno_t diag; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, accum_t &accum) const { + const auto colid = m_obj->entries(i); + if (colid != rowid) { + auto val = m_obj->vget(i); + auto lhs_colid = m_obj->lget(colid); + //accum -= val * lhs_colid; + Base::multiply_subtract(val, lhs_colid, accum); + } else { + diag = i; + } + } + }; + + template + KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, + const int my_rank, + const long node_count) const { + using reduce_item_t = typename Base::ArrayType; + using reducer_t = typename Base::SumArray; + using functor_t = std::conditional_t; + + static_assert( + !((!IsSerial && BlockEnabled) && UseThreadVec), + "ThreadVectorRanges are not yet supported for block-enabled"); + static_assert(!(IsSerial && UseThreadVec), + "Requested thread vector range in serial?"); + + const auto rowid = Base::nodes_grouped_by_level(my_rank + node_count); + const auto soffset = Base::row_map(rowid); + const auto eoffset = Base::row_map(rowid + 1); + const auto rhs_val = Base::rget(rowid); + + // Set up range to auto-skip diag if is sorted + const auto itr_b = soffset + (IsSorted ? (IsLower ? 0 : 1) : 0); + const auto itr_e = eoffset - (IsSorted ? (IsLower ? 
1 : 0) : 0); + + // We don't need the reducer to find the diag item if sorted + functor_t rf{this, rowid, -1}; + typename Base::reftype lhs_val = Base::lget(rowid); + reduce_item_t reduce = lhs_val; + + if constexpr (IsSerial) { + KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); + KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); + for (auto ptr = itr_b; ptr < itr_e; ++ptr) { + rf(ptr, reduce); + } + Base::copy(lhs_val, reduce); + } else { + KK_KERNEL_ASSERT_MSG(team != nullptr, + "Cannot do team operations without team"); + if constexpr (!UseThreadVec) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), + rf, reducer_t(reduce)); + team->team_barrier(); + Base::copy(*team, lhs_val, reduce); + } else { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); + Base::copy(lhs_val, reduce); + } + } + + // If sorted, we already know the diag. Otherwise, get it from the reducer + rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; + + // At end, handle the diag element. We need to be careful to avoid race + // conditions here. + if constexpr (IsSerial) { + // Serial case is easy, there's only 1 thread so just do the + // add_and_divide + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); + Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); + } else { + if constexpr (IsSorted) { + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide (except in BlockEnabled, then + // we can use team operations). + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(rf.diag)); + } else { + Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); + } + } else { + // Parallel unsorted case. Only one thread should know what the diag + // item is. We have that one do the add_and_divide. 
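          // rf.diag was initialized to -1 above and is only set by the
          // ReduceSumDiagFunctor invocation that actually visited the
          // diagonal entry, so the guard below restricts add_and_divide
          // to that single thread.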
+ if (rf.diag != -1) { + Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); + } + } + } + } + }; + // // TriLvlSched functors // @@ -647,9 +714,9 @@ struct SptrsvWrap { template struct TriLvlSchedTP1SolverFunctor - : public Common { - using Base = Common; long node_count; // like "block" offset into ngbl, my_league is the "local" @@ -682,9 +749,9 @@ struct SptrsvWrap { template struct TriLvlSchedRPSolverFunctor - : public Common { - using Base = Common; TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, @@ -710,10 +777,10 @@ struct SptrsvWrap { template struct TriLvlSchedTP1SingleBlockFunctor - : public Common { using Base = - Common; + Intermediate; entries_t nodes_per_level; @@ -1855,7 +1922,7 @@ struct SptrsvWrap { auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); const auto block_enabled = thandle.is_block_enabled(); - assert(block_size == BlockEnabled); + assert(block_enabled == BlockEnabled); // Set up functor types using UpperRPFunc = From 92eca0f64a49b6cc6e39de2e30913c9ebdc4423e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 18 Jul 2024 10:44:16 -0600 Subject: [PATCH 28/41] Formatting --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 226 +++++++++--------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 24 +- sparse/src/KokkosSparse_LUPrec.hpp | 2 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 2 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 33 +-- 5 files changed, 144 insertions(+), 143 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 5c4643c3f2..53c584407c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -119,9 +119,9 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - using reftype = scalar_t &; + using reftype = scalar_t &; using ArrayType = reftype; - using SumArray = reftype; + using SumArray = reftype; struct SBlock { template @@ -158,47 +158,43 @@ struct SptrsvWrap { // add. y += x KOKKOS_INLINE_FUNCTION - static void add(const member_type &team, const scalar_t& x, scalar_t& y) { + static void add(const member_type &team, const scalar_t &x, scalar_t &y) { Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); team.team_barrier(); } // serial add. y += x KOKKOS_INLINE_FUNCTION - static void add(const scalar_t& x, scalar_t& y) { - y += x; - } + static void add(const scalar_t &x, scalar_t &y) { y += x; } // divide. b /= A KOKKOS_INLINE_FUNCTION static void divide(const member_type &team, scalar_t &b, const scalar_t &A, - scalar_t*) { + scalar_t *) { Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); team.team_barrier(); } // serial divide. b /= A KOKKOS_INLINE_FUNCTION - static void divide(scalar_t &b, const scalar_t &A, scalar_t*) { - b /= A; - } + static void divide(scalar_t &b, const scalar_t &A, scalar_t *) { b /= A; } // multiply_subtract. 
C -= A * B KOKKOS_INLINE_FUNCTION static void multiply_subtract(const scalar_t &A, const scalar_t &B, - scalar_t &C) { + scalar_t &C) { C -= A * B; } KOKKOS_INLINE_FUNCTION - static void copy(const member_type&, scalar_t&, const scalar_t&) {} + static void copy(const member_type &, scalar_t &, const scalar_t &) {} KOKKOS_INLINE_FUNCTION - static void copy(scalar_t&, const scalar_t&) {} + static void copy(scalar_t &, const scalar_t &) {} // lget KOKKOS_INLINE_FUNCTION - scalar_t& lget(const size_type row) const { return lhs(row); } + scalar_t &lget(const size_type row) const { return lhs(row); } // rget KOKKOS_INLINE_FUNCTION @@ -210,11 +206,11 @@ struct SptrsvWrap { // lhs = (lhs + rhs) / diag (team) KOKKOS_INLINE_FUNCTION - static void add_and_divide(const member_type& team, scalar_t &lhs_val, const scalar_t &rhs_val, + static void add_and_divide(const member_type &team, scalar_t &lhs_val, + const scalar_t &rhs_val, const scalar_t &diag_val) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { - lhs_val = (lhs_val + rhs_val) / diag_val; - }); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { lhs_val = (lhs_val + rhs_val) / diag_val; }); } // lhs = (lhs + rhs) / diag (serial) @@ -224,13 +220,14 @@ struct SptrsvWrap { lhs_val = (lhs_val + rhs_val) / diag_val; } - // print KOKKOS_INLINE_FUNCTION static void print(const scalar_t &item) { std::cout << item << std::endl; } KOKKOS_INLINE_FUNCTION - static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } + static void print(ArrayType rhs, const int) { + std::cout << rhs << std::endl; + } }; // Partial specialization for block support @@ -242,45 +239,44 @@ struct SptrsvWrap { using Block = Kokkos::View< scalar_t **, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; // const block using CBlock = Kokkos::View< const scalar_t **, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; // scratch block using SBlock = Kokkos::View< scalar_t **, Layout, typename execution_space::scratch_memory_space, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using Vector = Kokkos::View< - scalar_t *, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; + scalar_t *, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits>; using CVector = Kokkos::View< - const scalar_t *, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; + const scalar_t *, Layout, typename ValuesType::device_type, + Kokkos::MemoryTraits>; static constexpr size_type MAX_VEC_SIZE = 11; - static constexpr size_type BUFF_SIZE = 128; + static constexpr size_type BUFF_SIZE = 128; using reftype = Vector; - struct ArrayType - { + struct ArrayType { scalar_t m_data[MAX_VEC_SIZE]; KOKKOS_INLINE_FUNCTION ArrayType() { init(); } KOKKOS_INLINE_FUNCTION - ArrayType(const ArrayType& rhs_) { + ArrayType(const ArrayType &rhs_) { for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = rhs_.m_data[i]; } KOKKOS_INLINE_FUNCTION - ArrayType(const Vector&) { init(); } + ArrayType(const Vector &) { init(); } KOKKOS_INLINE_FUNCTION void init() { @@ -288,42 +284,40 @@ struct SptrsvWrap { } KOKKOS_INLINE_FUNCTION - ArrayType& operator +=(const ArrayType& rhs_) { - for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i]; + ArrayType &operator+=(const ArrayType &rhs_) { + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) + m_data[i] += rhs_.m_data[i]; return *this; } KOKKOS_INLINE_FUNCTION - ArrayType& operator +=(const values_t& rhs_) { + ArrayType &operator+=(const 
values_t &rhs_) { for (int i = 0; i < rhs_.size(); ++i) m_data[i] += rhs_(i); return *this; } }; - struct SumArray - { - using reducer = SumArray; + struct SumArray { + using reducer = SumArray; using value_type = ArrayType; - using result_view_type = Kokkos::View; + using result_view_type = + Kokkos::View; - private: - value_type& m_value; + private: + value_type &m_value; - public: + public: KOKKOS_INLINE_FUNCTION - SumArray(value_type& value) : m_value(value) {} + SumArray(value_type &value) : m_value(value) {} KOKKOS_INLINE_FUNCTION - void join(value_type& dest, const value_type& src) const { - dest += src; - } - + void join(value_type &dest, const value_type &src) const { dest += src; } KOKKOS_INLINE_FUNCTION - void init(value_type& val) const { val.init(); } + void init(value_type &val) const { val.init(); } KOKKOS_INLINE_FUNCTION - value_type& reference() const { return m_value; } + value_type &reference() const { return m_value; } KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(&m_value, 1); } @@ -341,11 +335,8 @@ struct SptrsvWrap { size_type block_size; size_type block_items; - Common(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, - const RHSType &rhs_, + Common(const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_) : row_map(row_map_), @@ -355,11 +346,11 @@ struct SptrsvWrap { rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), block_size(block_size_), - block_items(block_size * block_size) - { + block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE); + KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, + "Max supported block size is " << MAX_VEC_SIZE); } KOKKOS_INLINE_FUNCTION @@ -390,27 +381,28 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, const View1 &lhs_, const View2 &rhs_) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs_.size()), - [&](const size_type i) { - lhs_.data()[i] = rhs_.data()[i]; - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, lhs_.size()), + [&](const size_type i) { lhs_.data()[i] = rhs_.data()[i]; }); } // add. y += x KOKKOS_INLINE_FUNCTION - static void add(const member_type &team, const CVector& x, const Vector& y) { + static void add(const member_type &team, const CVector &x, + const Vector &y) { KokkosBlas::Experimental::axpy(team, 1.0, x, y); } // serial add. y += x KOKKOS_INLINE_FUNCTION - static void add(const CVector& x, const Vector& y) { + static void add(const CVector &x, const Vector &y) { KokkosBlas::serial_axpy(1.0, x, y); } // divide. b /= A (b = b * A^-1) KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, const Vector &b, const CBlock &A) { + static void divide(const member_type &team, const Vector &b, + const CBlock &A) { // Team-shared buffer. Use for team work. 
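        // The SBlock constructed below draws from per-team scratch memory;
        // callers size it when building the team_policy via
        // SBlock::shmem_size(block_size, block_size) and
        // tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)), as the
        // lower_tri_solve / upper_tri_solve drivers later in this patch do.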
const auto block_size = b.size(); SBlock shared_buff(team.team_shmem(), block_size, block_size); @@ -450,15 +442,19 @@ struct SptrsvWrap { // A = LU // A^-1 = U^-1 * L^-1 // b = (b * U^-1) * L^-1, so do U trsv first - KokkosBatched::SerialTrsv< - KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); - - KokkosBatched::SerialTrsv< - KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); + KokkosBatched::SerialTrsv::invoke(1.0, + LU, + b); + + KokkosBatched::SerialTrsv::invoke(1.0, + LU, + b); } // multiply_subtract. C -= A * B @@ -473,19 +469,21 @@ struct SptrsvWrap { static void multiply_subtract(const CBlock &A, const CVector &B, Vector &C) { // Use gemv. alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBlas::SerialGemv< - KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>:: - invoke(-1.0, A, B, 1.0, C); + KokkosBlas::SerialGemv::invoke(-1.0, A, + B, 1.0, + C); } KOKKOS_INLINE_FUNCTION - static void copy(const member_type &team, const Vector& lhs_, ArrayType& rhsa) { + static void copy(const member_type &team, const Vector &lhs_, + ArrayType &rhsa) { CVector rhs_(&rhsa.m_data[0], lhs_.size()); assign(team, lhs_, rhs_); } KOKKOS_INLINE_FUNCTION - static void copy(const Vector& lhs_, ArrayType& rhsa) { + static void copy(const Vector &lhs_, ArrayType &rhsa) { CVector rhs_(&rhsa.m_data[0], lhs_.size()); assign(lhs_, rhs_); } @@ -505,13 +503,14 @@ struct SptrsvWrap { // vget KOKKOS_INLINE_FUNCTION CBlock vget(const size_type block) const { - return CBlock(values.data() + (block * block_items), block_size, block_size); + return CBlock(values.data() + (block * block_items), block_size, + block_size); } // lhs = (lhs + rhs) / diag KOKKOS_INLINE_FUNCTION - static void add_and_divide(const member_type& team, const Vector &lhs_val, const CVector &rhs_val, - const CBlock &diag_val) { + static void add_and_divide(const member_type &team, const Vector &lhs_val, + const CVector &rhs_val, const CBlock &diag_val) { add(team, rhs_val, lhs_val); divide(team, lhs_val, diag_val); } @@ -523,7 +522,6 @@ struct SptrsvWrap { divide(lhs_val, diag_val); } - // print KOKKOS_INLINE_FUNCTION static void print(const CBlock &item) { @@ -548,8 +546,7 @@ struct SptrsvWrap { } KOKKOS_INLINE_FUNCTION - static void print(const ArrayType& rhs_, const int block_size) - { + static void print(const ArrayType &rhs_, const int block_size) { std::cout << "Array: "; for (int i = 0; i < block_size; ++i) { std::cout << rhs_.m_data[i] << " "; @@ -558,8 +555,7 @@ struct SptrsvWrap { } KOKKOS_INLINE_FUNCTION - static void print(const SumArray& rhs_, const int block_size) - { + static void print(const SumArray &rhs_, const int block_size) { std::cout << "SumArray: "; for (int i = 0; i < block_size; ++i) { std::cout << rhs_.reference().m_data[i] << " "; @@ -574,12 +570,12 @@ struct SptrsvWrap { */ template - struct Intermediate : - public Common { + struct Intermediate : public Common { using Base = Common; - using accum_t = std::conditional_t; + using accum_t = + std::conditional_t; Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, @@ -598,7 +594,7 @@ struct SptrsvWrap { const auto colid = m_obj->entries(i); auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); - //accum -= val * lhs_colid; + // accum -= val * lhs_colid; 
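            // multiply_subtract keeps this update generic: in the scalar
            // specialization it is simply accum -= val * lhs_colid, while in
            // the blocked specialization val is a block, lhs_colid is a
            // vector, and the call lowers to a serial GEMV with alpha = -1
            // and beta = 1 accumulating into the ArrayType reducer value.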
Base::multiply_subtract(val, lhs_colid, accum); KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); } @@ -615,7 +611,7 @@ struct SptrsvWrap { if (colid != rowid) { auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); - //accum -= val * lhs_colid; + // accum -= val * lhs_colid; Base::multiply_subtract(val, lhs_colid, accum); } else { diag = i; @@ -630,7 +626,8 @@ struct SptrsvWrap { const long node_count) const { using reduce_item_t = typename Base::ArrayType; using reducer_t = typename Base::SumArray; - using functor_t = std::conditional_t; + using functor_t = + std::conditional_t; static_assert( !((!IsSerial && BlockEnabled) && UseThreadVec), @@ -650,7 +647,7 @@ struct SptrsvWrap { // We don't need the reducer to find the diag item if sorted functor_t rf{this, rowid, -1}; typename Base::reftype lhs_val = Base::lget(rowid); - reduce_item_t reduce = lhs_val; + reduce_item_t reduce = lhs_val; if constexpr (IsSerial) { KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); @@ -669,7 +666,8 @@ struct SptrsvWrap { Base::copy(*team, lhs_val, reduce); } else { Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); + Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, + reducer_t(reduce)); Base::copy(lhs_val, reduce); } } @@ -714,10 +712,10 @@ struct SptrsvWrap { template struct TriLvlSchedTP1SolverFunctor - : public Intermediate { - using Base = Intermediate; + : public Intermediate { + using Base = Intermediate; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset @@ -749,10 +747,10 @@ struct SptrsvWrap { template struct TriLvlSchedRPSolverFunctor - : public Intermediate { - using Base = Intermediate; + : public Intermediate { + using Base = Intermediate; TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, @@ -777,10 +775,10 @@ struct SptrsvWrap { template struct TriLvlSchedTP1SingleBlockFunctor - : public Intermediate { - using Base = - Intermediate; + : public Intermediate { + using Base = Intermediate; entries_t nodes_per_level; @@ -1544,8 +1542,8 @@ struct SptrsvWrap { Functor - template + template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, @@ -1653,7 +1651,8 @@ struct SptrsvWrap { auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); - const int scratch_size = LowerTPFunc::SBlock::shmem_size(block_size, block_size); + const int scratch_size = + LowerTPFunc::SBlock::shmem_size(block_size, block_size); tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_l_team", @@ -1901,8 +1900,8 @@ struct SptrsvWrap { #endif } // end lower_tri_solve - template + template static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, @@ -2011,7 +2010,8 @@ struct SptrsvWrap { auto tp = team_size == -1 ? 
team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); - const int scratch_size = UpperTPFunc::SBlock::shmem_size(block_size, block_size); + const int scratch_size = + UpperTPFunc::SBlock::shmem_size(block_size, block_size); tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_u_team", diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index d8deee382c..641bb1b05c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -124,7 +124,7 @@ struct SPTRSV_SOLVE; // Call specific algorithm type - auto sptrsv_handle = handle->get_sptrsv_handle(); + auto sptrsv_handle = handle->get_sptrsv_handle(); const auto block_enabled = sptrsv_handle->is_block_enabled(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() ? "KokkosSparse_sptrsv[lower]" @@ -149,12 +149,11 @@ struct SPTRSV_SOLVE(space, *sptrsv_handle, row_map, entries, - values, b, x); - } - else { - Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::template lower_tri_solve( + space, *sptrsv_handle, row_map, entries, values, b, x); + } else { + Sptrsv::template lower_tri_solve( + space, *sptrsv_handle, row_map, entries, values, b, x); } } } @@ -178,12 +177,11 @@ struct SPTRSV_SOLVE(space, *sptrsv_handle, row_map, entries, - values, b, x); - } - else { - Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::template upper_tri_solve( + space, *sptrsv_handle, row_map, entries, values, b, x); + } else { + Sptrsv::template upper_tri_solve( + space, *sptrsv_handle, row_map, entries, values, b, x); } } } diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 089213a823..75e5fd67b3 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -61,7 +61,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: //! Constructor: template - LUPrec(const CRSArg &L, const CRSArg &U, const size_type block_size=0) + LUPrec(const CRSArg &L, const CRSArg &U, const size_type block_size = 0) : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numPointRows()), diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 3aa4abbf31..94df164622 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -819,7 +819,7 @@ struct SpilukTest { // Make precond. KokkosSparse::Experimental::LUPrec - myPrec(L, U, UseBlocks ? block_size : 0); + myPrec(L, U, UseBlocks ? 
block_size : 0); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 2c23a5ae67..9f41756fc1 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -75,12 +75,13 @@ struct SptrsvTest { } static std::vector> get_6x6_ut_ones_fixture() { - std::vector> A = {{1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, - {0.00, 1.00, 0.00, 0.00, 0.00, 1.00}, - {0.00, 0.00, 1.00, 1.00, 0.00, 1.00}, - {0.00, 0.00, 0.00, 1.00, 0.00, 1.00}, - {0.00, 0.00, 0.00, 0.00, 1.00, 1.00}, - {0.00, 0.00, 0.00, 0.00, 0.00, 1.00}}; + std::vector> A = { + {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 1.00, 0.00, 0.00, 0.00, 1.00}, + {0.00, 0.00, 1.00, 1.00, 0.00, 1.00}, + {0.00, 0.00, 0.00, 1.00, 0.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 0.00, 1.00}}; return A; } @@ -114,12 +115,13 @@ struct SptrsvTest { } static std::vector> get_6x6_lt_ones_fixture() { - std::vector> A = {{1.00, 0.00, 0.00, 0.00, 0.00, 0.00}, - {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, - {0.00, 0.00, 1.00, 0.00, 0.00, 0.00}, - {0.00, 0.00, 0.00, 1.00, 0.00, 0.00}, - {0.00, 0.00, 0.00, 1.00, 1.00, 0.00}, - {0.00, 1.00, 1.00, 1.00, 1.00, 1.00}}; + std::vector> A = { + {1.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 1.00, 1.00, 0.00}, + {0.00, 1.00, 1.00, 1.00, 1.00, 1.00}}; return A; } @@ -514,9 +516,10 @@ struct SptrsvTest { } } - static void run_test_sptrsv_blocks_impl(const bool is_lower, const size_type block_size) { - - auto fixture = is_lower ? get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); + static void run_test_sptrsv_blocks_impl(const bool is_lower, + const size_type block_size) { + auto fixture = + is_lower ? 
get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); const auto [triMtx_crs, lhs, rhs] = create_crs_lhs_rhs(fixture); Bsr triMtx(triMtx_crs, block_size); From b1ca56e4da2fc3f74ddd7cf6f93598aef850b314 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 29 Jul 2024 13:54:32 -0600 Subject: [PATCH 29/41] Update work to latest format style --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 321 ++++++------------ .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 17 +- sparse/src/KokkosSparse_LUPrec.hpp | 32 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 3 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 29 +- 5 files changed, 136 insertions(+), 266 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 89e72b6523..5860e1b946 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -129,18 +129,15 @@ struct SptrsvWrap { static int shmem_size(size_type, size_type) { return 0; } }; - Common(const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const size_type block_size_ = 0) + Common(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) : row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_) { - KK_REQUIRE_MSG(block_size_ == 0, - "Tried to use blocks with the unblocked Common?"); + KK_REQUIRE_MSG(block_size_ == 0, "Tried to use blocks with the unblocked Common?"); } KOKKOS_INLINE_FUNCTION @@ -148,9 +145,7 @@ struct SptrsvWrap { // lset KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const scalar_t value) const { - lhs(row) = value; - } + void lset(const size_type row, const scalar_t value) const { lhs(row) = value; } // add. y += x KOKKOS_INLINE_FUNCTION @@ -165,8 +160,7 @@ struct SptrsvWrap { // divide. b /= A KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, scalar_t &b, const scalar_t &A, - scalar_t *) { + static void divide(const member_type &team, scalar_t &b, const scalar_t &A, scalar_t *) { Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); team.team_barrier(); } @@ -177,10 +171,7 @@ struct SptrsvWrap { // multiply_subtract. 
C -= A * B KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const scalar_t &A, const scalar_t &B, - scalar_t &C) { - C -= A * B; - } + static void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) { C -= A * B; } KOKKOS_INLINE_FUNCTION static void copy(const member_type &, scalar_t &, const scalar_t &) {} @@ -202,17 +193,14 @@ struct SptrsvWrap { // lhs = (lhs + rhs) / diag (team) KOKKOS_INLINE_FUNCTION - static void add_and_divide(const member_type &team, scalar_t &lhs_val, - const scalar_t &rhs_val, + static void add_and_divide(const member_type &team, scalar_t &lhs_val, const scalar_t &rhs_val, const scalar_t &diag_val) { - Kokkos::single(Kokkos::PerTeam(team), - [&]() { lhs_val = (lhs_val + rhs_val) / diag_val; }); + Kokkos::single(Kokkos::PerTeam(team), [&]() { lhs_val = (lhs_val + rhs_val) / diag_val; }); } // lhs = (lhs + rhs) / diag (serial) KOKKOS_INLINE_FUNCTION - static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, - const scalar_t &diag_val) { + static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, const scalar_t &diag_val) { lhs_val = (lhs_val + rhs_val) / diag_val; } @@ -221,39 +209,31 @@ struct SptrsvWrap { static void print(const scalar_t &item) { std::cout << item << std::endl; } KOKKOS_INLINE_FUNCTION - static void print(ArrayType rhs, const int) { - std::cout << rhs << std::endl; - } + static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } }; // Partial specialization for block support - template + template struct Common { // BSR data is in LayoutRight! using Layout = Kokkos::LayoutRight; - using Block = Kokkos::View< - scalar_t **, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits>; + using Block = Kokkos::View>; // const block - using CBlock = Kokkos::View< - const scalar_t **, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits>; + using CBlock = Kokkos::View>; // scratch block - using SBlock = Kokkos::View< - scalar_t **, Layout, typename execution_space::scratch_memory_space, - Kokkos::MemoryTraits>; + using SBlock = Kokkos::View>; - using Vector = Kokkos::View< - scalar_t *, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits>; + using Vector = Kokkos::View>; - using CVector = Kokkos::View< - const scalar_t *, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits>; + using CVector = Kokkos::View>; static constexpr size_type MAX_VEC_SIZE = 11; static constexpr size_type BUFF_SIZE = 128; @@ -281,8 +261,7 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION ArrayType &operator+=(const ArrayType &rhs_) { - for (size_type i = 0; i < MAX_VEC_SIZE; ++i) - m_data[i] += rhs_.m_data[i]; + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i]; return *this; } @@ -294,10 +273,9 @@ struct SptrsvWrap { }; struct SumArray { - using reducer = SumArray; - using value_type = ArrayType; - using result_view_type = - Kokkos::View; + using reducer = SumArray; + using value_type = ArrayType; + using result_view_type = Kokkos::View; private: value_type &m_value; @@ -331,10 +309,8 @@ struct SptrsvWrap { size_type block_size; size_type block_items; - Common(const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const size_type block_size_) + Common(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type 
block_size_) : row_map(row_map_), entries(entries_), values(values_), @@ -343,10 +319,8 @@ struct SptrsvWrap { nodes_grouped_by_level(nodes_grouped_by_level_), block_size(block_size_), block_items(block_size * block_size) { - KK_REQUIRE_MSG(block_size > 0, - "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, - "Max supported block size is " << MAX_VEC_SIZE); + KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE); } KOKKOS_INLINE_FUNCTION @@ -354,9 +328,7 @@ struct SptrsvWrap { // lset KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const scalar_t &value) const { - KokkosBlas::SerialSet::invoke(value, lget(row)); - } + void lset(const size_type row, const scalar_t &value) const { KokkosBlas::SerialSet::invoke(value, lget(row)); } KOKKOS_INLINE_FUNCTION void lset(const size_type row, const CVector &rhs_) const { @@ -366,39 +338,31 @@ struct SptrsvWrap { // assign template - KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, - const View2 &rhs_) { + KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, const View2 &rhs_) { for (size_t i = 0; i < lhs_.size(); ++i) { lhs_.data()[i] = rhs_.data()[i]; } } template - KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, - const View1 &lhs_, - const View2 &rhs_) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, lhs_.size()), - [&](const size_type i) { lhs_.data()[i] = rhs_.data()[i]; }); + KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, const View1 &lhs_, const View2 &rhs_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs_.size()), + [&](const size_type i) { lhs_.data()[i] = rhs_.data()[i]; }); } // add. y += x KOKKOS_INLINE_FUNCTION - static void add(const member_type &team, const CVector &x, - const Vector &y) { + static void add(const member_type &team, const CVector &x, const Vector &y) { KokkosBlas::Experimental::axpy(team, 1.0, x, y); } // serial add. y += x KOKKOS_INLINE_FUNCTION - static void add(const CVector &x, const Vector &y) { - KokkosBlas::serial_axpy(1.0, x, y); - } + static void add(const CVector &x, const Vector &y) { KokkosBlas::serial_axpy(1.0, x, y); } // divide. b /= A (b = b * A^-1) KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, const Vector &b, - const CBlock &A) { + static void divide(const member_type &team, const Vector &b, const CBlock &A) { // Team-shared buffer. Use for team work. const auto block_size = b.size(); SBlock shared_buff(team.team_shmem(), block_size, block_size); @@ -406,21 +370,17 @@ struct SptrsvWrap { // Need a temp block to do LU of A Block LU(shared_buff.data(), block_size, block_size); assign(team, LU, A); - KokkosBatched::TeamLU::invoke(team, LU); + KokkosBatched::TeamLU::invoke(team, LU); // A = LU // A^-1 = U^-1 * L^-1 // b = (b * U^-1) * L^-1, so do U trsv first - KokkosBatched::TeamTrsv< - member_type, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); - - KokkosBatched::TeamTrsv< - member_type, KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); + KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, + b); + + KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); } // serial divide. 
b /= A (b = b * A^-1) @@ -438,42 +398,29 @@ struct SptrsvWrap { // A = LU // A^-1 = U^-1 * L^-1 // b = (b * U^-1) * L^-1, so do U trsv first - KokkosBatched::SerialTrsv::invoke(1.0, - LU, - b); - - KokkosBatched::SerialTrsv::invoke(1.0, - LU, - b); + KokkosBatched::SerialTrsv::invoke(1.0, LU, b); + + KokkosBatched::SerialTrsv::invoke(1.0, LU, b); } // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &B, - ArrayType &Ca) { + static void multiply_subtract(const CBlock &A, const CVector &B, ArrayType &Ca) { Vector C(&Ca.m_data[0], B.size()); multiply_subtract(A, B, C); } KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &B, - Vector &C) { + static void multiply_subtract(const CBlock &A, const CVector &B, Vector &C) { // Use gemv. alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBlas::SerialGemv::invoke(-1.0, A, - B, 1.0, - C); + KokkosBlas::SerialGemv::invoke(-1.0, A, B, 1.0, + C); } KOKKOS_INLINE_FUNCTION - static void copy(const member_type &team, const Vector &lhs_, - ArrayType &rhsa) { + static void copy(const member_type &team, const Vector &lhs_, ArrayType &rhsa) { CVector rhs_(&rhsa.m_data[0], lhs_.size()); assign(team, lhs_, rhs_); } @@ -486,34 +433,28 @@ struct SptrsvWrap { // lget KOKKOS_INLINE_FUNCTION - Vector lget(const size_type row) const { - return Vector(lhs.data() + (row * block_size), block_size); - } + Vector lget(const size_type row) const { return Vector(lhs.data() + (row * block_size), block_size); } // rget KOKKOS_INLINE_FUNCTION - CVector rget(const size_type row) const { - return CVector(rhs.data() + (row * block_size), block_size); - } + CVector rget(const size_type row) const { return CVector(rhs.data() + (row * block_size), block_size); } // vget KOKKOS_INLINE_FUNCTION CBlock vget(const size_type block) const { - return CBlock(values.data() + (block * block_items), block_size, - block_size); + return CBlock(values.data() + (block * block_items), block_size, block_size); } // lhs = (lhs + rhs) / diag KOKKOS_INLINE_FUNCTION - static void add_and_divide(const member_type &team, const Vector &lhs_val, - const CVector &rhs_val, const CBlock &diag_val) { + static void add_and_divide(const member_type &team, const Vector &lhs_val, const CVector &rhs_val, + const CBlock &diag_val) { add(team, rhs_val, lhs_val); divide(team, lhs_val, diag_val); } KOKKOS_INLINE_FUNCTION - static void add_and_divide(const Vector &lhs_val, const CVector &rhs_val, - const CBlock &diag_val) { + static void add_and_divide(const Vector &lhs_val, const CVector &rhs_val, const CBlock &diag_val) { add(rhs_val, lhs_val); divide(lhs_val, diag_val); } @@ -564,21 +505,14 @@ struct SptrsvWrap { * Intermediate class that contains implementation that identical * for blocked / non-blocked */ - template - struct Intermediate : public Common { - using Base = Common; - using accum_t = - std::conditional_t; - - Intermediate(const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const size_type block_size_ = 0) - : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, - block_size_) {} + template + struct Intermediate : public Common { + using Base = Common; + using accum_t = std::conditional_t; + + Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const 
size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {} struct ReduceSumFunctor { const Base *m_obj; @@ -615,21 +549,15 @@ struct SptrsvWrap { } }; - template - KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, - const int my_rank, - const long node_count) const { + template + KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, const int my_rank, const long node_count) const { using reduce_item_t = typename Base::ArrayType; using reducer_t = typename Base::SumArray; - using functor_t = - std::conditional_t; + using functor_t = std::conditional_t; - static_assert( - !((!IsSerial && BlockEnabled) && UseThreadVec), - "ThreadVectorRanges are not yet supported for block-enabled"); - static_assert(!(IsSerial && UseThreadVec), - "Requested thread vector range in serial?"); + static_assert(!((!IsSerial && BlockEnabled) && UseThreadVec), + "ThreadVectorRanges are not yet supported for block-enabled"); + static_assert(!(IsSerial && UseThreadVec), "Requested thread vector range in serial?"); const auto rowid = Base::nodes_grouped_by_level(my_rank + node_count); const auto soffset = Base::row_map(rowid); @@ -655,14 +583,11 @@ struct SptrsvWrap { } else { KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team"); if constexpr (!UseThreadVec) { - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), - rf, reducer_t(reduce)); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); team->team_barrier(); Base::copy(*team, lhs_val, reduce); } else { - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, - reducer_t(reduce)); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); Base::copy(lhs_val, reduce); } } @@ -704,13 +629,11 @@ struct SptrsvWrap { // TriLvlSched functors // - template + template struct TriLvlSchedTP1SolverFunctor - : public Intermediate { - using Base = Intermediate; + : public Intermediate { + using Base = Intermediate; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset @@ -732,19 +655,14 @@ struct SptrsvWrap { } }; - template + template struct TriLvlSchedRPSolverFunctor - : public Intermediate { - using Base = Intermediate; - - TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, + : public Intermediate { + using Base = Intermediate; + + TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, + LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {} @@ -759,10 +677,8 @@ struct SptrsvWrap { template struct TriLvlSchedTP1SingleBlockFunctor - : public Intermediate { - using Base = Intermediate; + : public Intermediate { + using Base = Intermediate; entries_t nodes_per_level; @@ -1407,17 +1323,12 @@ struct SptrsvWrap { #endif -#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ - Functor - - template - static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { +#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ + Functor + + template 
+ static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif @@ -1433,10 +1344,8 @@ struct SptrsvWrap { assert(block_enabled == BlockEnabled); // Set up functor types - using LowerRPFunc = - FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, BlockEnabled); - using LowerTPFunc = - FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, BlockEnabled); + using LowerRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, BlockEnabled); + using LowerTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1507,12 +1416,10 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - auto tp = team_size == -1 - ? team_policy(space, lvl_nodes, Kokkos::AUTO) - : team_policy(space, lvl_nodes, team_size); - const int scratch_size = - LowerTPFunc::SBlock::shmem_size(block_size, block_size); - tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + auto tp = + team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = LowerTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require(tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), ltpp); @@ -1711,13 +1618,9 @@ struct SptrsvWrap { #endif } // end lower_tri_solve - template - static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif @@ -1735,10 +1638,8 @@ struct SptrsvWrap { assert(block_enabled == BlockEnabled); // Set up functor types - using UpperRPFunc = - FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, BlockEnabled); - using UpperTPFunc = - FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, BlockEnabled); + using UpperRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, BlockEnabled); + using UpperTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1808,12 +1709,10 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { UpperTPFunc utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - auto tp = team_size == -1 - ? 
team_policy(space, lvl_nodes, Kokkos::AUTO) - : team_policy(space, lvl_nodes, team_size); - const int scratch_size = - UpperTPFunc::SBlock::shmem_size(block_size, block_size); - tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + auto tp = + team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = UpperTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_u_team", Kokkos::Experimental::require(tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), utpp); diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index b104335dd9..87cf72686c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -105,9 +105,8 @@ struct SPTRSV_SOLVEget_sptrsv_handle(); const auto block_enabled = sptrsv_handle->is_block_enabled(); - Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() - ? "KokkosSparse_sptrsv[lower]" - : "KokkosSparse_sptrsv[upper]"); + Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() ? "KokkosSparse_sptrsv[lower]" + : "KokkosSparse_sptrsv[upper]"); if (sptrsv_handle->is_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); @@ -124,11 +123,9 @@ struct SPTRSV_SOLVE( - space, *sptrsv_handle, row_map, entries, values, b, x); + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } else { - Sptrsv::template lower_tri_solve( - space, *sptrsv_handle, row_map, entries, values, b, x); + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } } @@ -148,11 +145,9 @@ struct SPTRSV_SOLVE( - space, *sptrsv_handle, row_map, entries, values, b, x); + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } else { - Sptrsv::template upper_tri_solve( - space, *sptrsv_handle, row_map, entries, values, b, x); + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } } diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 75e5fd67b3..a4b62a28ba 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -62,19 +62,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { //! Constructor: template LUPrec(const CRSArg &L, const CRSArg &U, const size_type block_size = 0) - : _L(L), - _U(U), - _tmp("LUPrec::_tmp", L.numPointRows()), - _tmp2("LUPrec::_tmp", L.numPointRows()), - _khL(), - _khU() { - KK_REQUIRE_MSG(L.numPointRows() == U.numPointRows(), - "LUPrec: L.numRows() != U.numRows()"); - - _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), - true, block_size); - _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), - false, block_size); + : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numPointRows()), _tmp2("LUPrec::_tmp", L.numPointRows()), _khL(), _khU() { + KK_REQUIRE_MSG(L.numPointRows() == U.numPointRows(), "LUPrec: L.numRows() != U.numRows()"); + + _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), true, block_size); + _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), false, block_size); } //! Destructor. 
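For orientation, a minimal usage sketch of this preconditioner follows. The template arguments MatrixType and KernelHandleType and the names L, U, X, Y, use_blocks, and block_size are placeholders modeled on the spiluk test elsewhere in this series, not definitions from this hunk:

  // Construct from precomputed triangular factors; block_size == 0 selects
  // the point sptrsv path, a nonzero block_size the blocked (BSR) path.
  KokkosSparse::Experimental::LUPrec<MatrixType, KernelHandleType> myPrec(
      L, U, use_blocks ? block_size : 0);

  // Apply M^-1 ~= U^-1 L^-1 to X, writing alpha * M^-1 X + beta * Y into Y
  // (alpha = 1, beta = 0 by default), e.g. as the preconditioner of a
  // GMRES solve.
  myPrec.apply(X, Y);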
@@ -94,20 +86,16 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// ///// It takes L and U and the stores U^inv L^inv X in Y // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { - KK_REQUIRE_MSG(transM[0] == NoTranspose[0], - "LUPrec::apply only supports 'N' for transM"); + virtual void apply(const Kokkos::View &X, const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], "LUPrec::apply only supports 'N' for transM"); sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); - sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, - _tmp2); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, _tmp2); KokkosBlas::axpby(alpha, _tmp2, beta, Y); } diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 02486d622d..1fde1ac5ed 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -740,8 +740,7 @@ struct SpilukTest { gmres_handle->set_verbose(verbose); // Make precond. - KokkosSparse::Experimental::LUPrec - myPrec(L, U, UseBlocks ? block_size : 0); + KokkosSparse::Experimental::LUPrec myPrec(L, U, UseBlocks ? block_size : 0); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 4faa191557..91403bb434 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -74,13 +74,9 @@ struct SptrsvTest { } static std::vector> get_6x6_ut_ones_fixture() { - std::vector> A = { - {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, - {0.00, 1.00, 0.00, 0.00, 0.00, 1.00}, - {0.00, 0.00, 1.00, 1.00, 0.00, 1.00}, - {0.00, 0.00, 0.00, 1.00, 0.00, 1.00}, - {0.00, 0.00, 0.00, 0.00, 1.00, 1.00}, - {0.00, 0.00, 0.00, 0.00, 0.00, 1.00}}; + std::vector> A = {{1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, {0.00, 1.00, 0.00, 0.00, 0.00, 1.00}, + {0.00, 0.00, 1.00, 1.00, 0.00, 1.00}, {0.00, 0.00, 0.00, 1.00, 0.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 1.00, 1.00}, {0.00, 0.00, 0.00, 0.00, 0.00, 1.00}}; return A; } @@ -114,13 +110,9 @@ struct SptrsvTest { } static std::vector> get_6x6_lt_ones_fixture() { - std::vector> A = { - {1.00, 0.00, 0.00, 0.00, 0.00, 0.00}, - {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, - {0.00, 0.00, 1.00, 0.00, 0.00, 0.00}, - {0.00, 0.00, 0.00, 1.00, 0.00, 0.00}, - {0.00, 0.00, 0.00, 1.00, 1.00, 0.00}, - {0.00, 1.00, 1.00, 1.00, 1.00, 1.00}}; + std::vector> A = {{1.00, 0.00, 0.00, 0.00, 0.00, 0.00}, {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 0.00, 0.00, 0.00}, {0.00, 0.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 1.00, 1.00, 0.00}, {0.00, 1.00, 1.00, 1.00, 1.00, 1.00}}; return A; } @@ -497,10 +489,8 @@ struct SptrsvTest { } } - static void run_test_sptrsv_blocks_impl(const bool is_lower, - const size_type block_size) { - auto fixture = - is_lower ? get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); + static void run_test_sptrsv_blocks_impl(const bool is_lower, const size_type block_size) { + auto fixture = is_lower ? 
get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture(); const auto [triMtx_crs, lhs, rhs] = create_crs_lhs_rhs(fixture); Bsr triMtx(triMtx_crs, block_size); @@ -514,8 +504,7 @@ struct SptrsvTest { } } - static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, - const bool is_lower) { + static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, const bool is_lower) { // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition bool run_streams_test = true; From 12d8101598952ec05f12e238aa1fbcd95b136e8b Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 29 Jul 2024 14:28:56 -0600 Subject: [PATCH 30/41] Remove unused functions. Remove prints. Add barriers --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 75 ++----------------- 1 file changed, 5 insertions(+), 70 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 5860e1b946..6d8971cd91 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -147,28 +147,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION void lset(const size_type row, const scalar_t value) const { lhs(row) = value; } - // add. y += x - KOKKOS_INLINE_FUNCTION - static void add(const member_type &team, const scalar_t &x, scalar_t &y) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); - team.team_barrier(); - } - - // serial add. y += x - KOKKOS_INLINE_FUNCTION - static void add(const scalar_t &x, scalar_t &y) { y += x; } - - // divide. b /= A - KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, scalar_t &b, const scalar_t &A, scalar_t *) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); - team.team_barrier(); - } - - // serial divide. b /= A - KOKKOS_INLINE_FUNCTION - static void divide(scalar_t &b, const scalar_t &A, scalar_t *) { b /= A; } - // multiply_subtract. 
C -= A * B KOKKOS_INLINE_FUNCTION static void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) { C -= A * B; } @@ -203,13 +181,6 @@ struct SptrsvWrap { static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, const scalar_t &diag_val) { lhs_val = (lhs_val + rhs_val) / diag_val; } - - // print - KOKKOS_INLINE_FUNCTION - static void print(const scalar_t &item) { std::cout << item << std::endl; } - - KOKKOS_INLINE_FUNCTION - static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } }; // Partial specialization for block support @@ -370,15 +341,18 @@ struct SptrsvWrap { // Need a temp block to do LU of A Block LU(shared_buff.data(), block_size, block_size); assign(team, LU, A); + team.team_barrier(); KokkosBatched::TeamLU::invoke(team, LU); // A = LU // A^-1 = U^-1 * L^-1 // b = (b * U^-1) * L^-1, so do U trsv first + team.team_barrier(); KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); + team.team_barrier(); KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); } @@ -450,6 +424,7 @@ struct SptrsvWrap { static void add_and_divide(const member_type &team, const Vector &lhs_val, const CVector &rhs_val, const CBlock &diag_val) { add(team, rhs_val, lhs_val); + team.team_barrier(); divide(team, lhs_val, diag_val); } @@ -458,47 +433,6 @@ struct SptrsvWrap { add(rhs_val, lhs_val); divide(lhs_val, diag_val); } - - // print - KOKKOS_INLINE_FUNCTION - static void print(const CBlock &item) { - std::cout << "Block: "; - for (size_type i = 0; i < item.extent(0); ++i) { - std::cout << " "; - for (size_type j = 0; j < item.extent(1); ++j) { - std::cout << item(i, j) << " "; - } - std::cout << std::endl; - } - } - - // print - KOKKOS_INLINE_FUNCTION - static void print(const CVector &item) { - std::cout << "Vector: "; - for (size_type i = 0; i < item.extent(0); ++i) { - std::cout << item(i) << " "; - } - std::cout << std::endl; - } - - KOKKOS_INLINE_FUNCTION - static void print(const ArrayType &rhs_, const int block_size) { - std::cout << "Array: "; - for (int i = 0; i < block_size; ++i) { - std::cout << rhs_.m_data[i] << " "; - } - std::cout << std::endl; - } - - KOKKOS_INLINE_FUNCTION - static void print(const SumArray &rhs_, const int block_size) { - std::cout << "SumArray: "; - for (int i = 0; i < block_size; ++i) { - std::cout << rhs_.reference().m_data[i] << " "; - } - std::cout << std::endl; - } }; /** @@ -586,6 +520,7 @@ struct SptrsvWrap { Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); team->team_barrier(); Base::copy(*team, lhs_val, reduce); + team->team_barrier(); } else { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); Base::copy(lhs_val, reduce); From 9d7abb721153a1ccf33ffecfc598c5127af67eac Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 1 Aug 2024 17:03:55 -0600 Subject: [PATCH 31/41] Minor fixes --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 6d8971cd91..1e9dab55e9 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -149,7 +149,7 @@ struct SptrsvWrap { // multiply_subtract. 
C -= A * B KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) { C -= A * B; } + static void multiply_subtract(const scalar_t &a, const scalar_t &b, scalar_t &c) { c -= a * b; } KOKKOS_INLINE_FUNCTION static void copy(const member_type &, scalar_t &, const scalar_t &) {} @@ -235,12 +235,6 @@ struct SptrsvWrap { for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i]; return *this; } - - KOKKOS_INLINE_FUNCTION - ArrayType &operator+=(const values_t &rhs_) { - for (int i = 0; i < rhs_.size(); ++i) m_data[i] += rhs_(i); - return *this; - } }; struct SumArray { @@ -381,16 +375,16 @@ struct SptrsvWrap { // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &B, ArrayType &Ca) { - Vector C(&Ca.m_data[0], B.size()); - multiply_subtract(A, B, C); + static void multiply_subtract(const CBlock &A, const CVector &b, ArrayType &ca) { + Vector c(&ca.m_data[0], b.size()); + multiply_subtract(A, b, c); } KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &B, Vector &C) { + static void multiply_subtract(const CBlock &A, const CVector &b, Vector &c) { // Use gemv. alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBlas::SerialGemv::invoke(-1.0, A, B, 1.0, - C); + KokkosBlas::SerialGemv::invoke(-1.0, A, b, 1.0, + c); } KOKKOS_INLINE_FUNCTION @@ -467,7 +461,7 @@ struct SptrsvWrap { struct ReduceSumDiagFunctor { const Base *m_obj; const lno_t rowid; - lno_t diag; + mutable lno_t diag; KOKKOS_INLINE_FUNCTION void operator()(size_type i, accum_t &accum) const { From 42ca742345289e123d91fbbe01f9b17a0afd5f6e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 2 Aug 2024 12:29:06 -0600 Subject: [PATCH 32/41] lset is not used --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 1e9dab55e9..fdaef9cbd1 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -143,10 +143,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION size_type get_block_size() const { return 0; } - // lset - KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const scalar_t value) const { lhs(row) = value; } - // multiply_subtract. 
C -= A * B KOKKOS_INLINE_FUNCTION static void multiply_subtract(const scalar_t &a, const scalar_t &b, scalar_t &c) { c -= a * b; } @@ -291,16 +287,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION size_type get_block_size() const { return block_size; } - // lset - KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const scalar_t &value) const { KokkosBlas::SerialSet::invoke(value, lget(row)); } - - KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const CVector &rhs_) const { - auto lvec = lget(row); - assign(lvec, rhs_); - } - // assign template KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, const View2 &rhs_) { From 22e17be87ca03b0bf9faeac8d2171fd97828524f Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 2 Aug 2024 12:45:12 -0600 Subject: [PATCH 33/41] Fix for clang --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index fdaef9cbd1..67d49b9704 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -439,7 +439,14 @@ struct SptrsvWrap { auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); // accum -= val * lhs_colid; - Base::multiply_subtract(val, lhs_colid, accum); + if constexpr(BlockEnabled) { + accum_t temp; + Base::multiply_subtract(val, lhs_colid, temp); + accum += temp; + } + else { + Base::multiply_subtract(val, lhs_colid, accum); + } KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); } }; @@ -456,7 +463,14 @@ struct SptrsvWrap { auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); // accum -= val * lhs_colid; - Base::multiply_subtract(val, lhs_colid, accum); + if constexpr(BlockEnabled) { + accum_t temp; + Base::multiply_subtract(val, lhs_colid, temp); + accum += temp; + } + else { + Base::multiply_subtract(val, lhs_colid, accum); + } } else { diag = i; } From bdf3f06c1598a89a283f64e27490a9b33ab9e103 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 2 Aug 2024 16:48:49 -0600 Subject: [PATCH 34/41] formatting --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 67d49b9704..d018174f87 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -439,12 +439,11 @@ struct SptrsvWrap { auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); // accum -= val * lhs_colid; - if constexpr(BlockEnabled) { + if constexpr (BlockEnabled) { accum_t temp; Base::multiply_subtract(val, lhs_colid, temp); accum += temp; - } - else { + } else { Base::multiply_subtract(val, lhs_colid, accum); } KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); @@ -463,12 +462,11 @@ struct SptrsvWrap { auto val = m_obj->vget(i); auto lhs_colid = m_obj->lget(colid); // accum -= val * lhs_colid; - if constexpr(BlockEnabled) { - accum_t temp; - Base::multiply_subtract(val, lhs_colid, temp); - accum += temp; - } - else { + if constexpr (BlockEnabled) { + accum_t temp; + Base::multiply_subtract(val, lhs_colid, temp); + accum += temp; + } else { Base::multiply_subtract(val, lhs_colid, accum); } } else { From 984d804f19f0cd499362244b513b920b7c977e27 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sat, 3 Aug 2024 13:56:25 -0600 Subject: [PATCH 35/41] New impl 
approach --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 247 ++++++------------ 1 file changed, 83 insertions(+), 164 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index d018174f87..15f5d61381 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -79,7 +79,7 @@ struct SptrsvWrap { using range_type = Kokkos::pair; // Tag structs - struct UnsortedTag {}; + struct UnsortedTag {}; // This doesn't appear to be supported struct LargerCutoffTag {}; struct UnsortedLargerCutoffTag {}; @@ -116,8 +116,6 @@ struct SptrsvWrap { entries_t nodes_grouped_by_level; using reftype = scalar_t &; - using ArrayType = reftype; - using SumArray = reftype; struct SBlock { template @@ -143,16 +141,6 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION size_type get_block_size() const { return 0; } - // multiply_subtract. C -= A * B - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const scalar_t &a, const scalar_t &b, scalar_t &c) { c -= a * b; } - - KOKKOS_INLINE_FUNCTION - static void copy(const member_type &, scalar_t &, const scalar_t &) {} - - KOKKOS_INLINE_FUNCTION - static void copy(scalar_t &, const scalar_t &) {} - // lget KOKKOS_INLINE_FUNCTION scalar_t &lget(const size_type row) const { return lhs(row); } @@ -207,60 +195,6 @@ struct SptrsvWrap { using reftype = Vector; - struct ArrayType { - scalar_t m_data[MAX_VEC_SIZE]; - - KOKKOS_INLINE_FUNCTION - ArrayType() { init(); } - - KOKKOS_INLINE_FUNCTION - ArrayType(const ArrayType &rhs_) { - for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = rhs_.m_data[i]; - } - - KOKKOS_INLINE_FUNCTION - ArrayType(const Vector &) { init(); } - - KOKKOS_INLINE_FUNCTION - void init() { - for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = 0; - } - - KOKKOS_INLINE_FUNCTION - ArrayType &operator+=(const ArrayType &rhs_) { - for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i]; - return *this; - } - }; - - struct SumArray { - using reducer = SumArray; - using value_type = ArrayType; - using result_view_type = Kokkos::View; - - private: - value_type &m_value; - - public: - KOKKOS_INLINE_FUNCTION - SumArray(value_type &value) : m_value(value) {} - - KOKKOS_INLINE_FUNCTION - void join(value_type &dest, const value_type &src) const { dest += src; } - - KOKKOS_INLINE_FUNCTION - void init(value_type &val) const { val.init(); } - - KOKKOS_INLINE_FUNCTION - value_type &reference() const { return m_value; } - - KOKKOS_INLINE_FUNCTION - result_view_type view() const { return result_view_type(&m_value, 1); } - - KOKKOS_INLINE_FUNCTION - bool reference_scalar() const { return true; } - }; - RowMapType row_map; EntriesType entries; ValuesType values; @@ -359,32 +293,6 @@ struct SptrsvWrap { KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); } - // multiply_subtract. C -= A * B - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &b, ArrayType &ca) { - Vector c(&ca.m_data[0], b.size()); - multiply_subtract(A, b, c); - } - - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &b, Vector &c) { - // Use gemv. 
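      // (Recall the BLAS convention: gemv computes c = beta * c + alpha * A * b,
      //  so the alpha = -1 / beta = 1 choice in the call below is exactly
      //  c -= A * b. Restated here for clarity only.)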
alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBlas::SerialGemv::invoke(-1.0, A, b, 1.0, - c); - } - - KOKKOS_INLINE_FUNCTION - static void copy(const member_type &team, const Vector &lhs_, ArrayType &rhsa) { - CVector rhs_(&rhsa.m_data[0], lhs_.size()); - assign(team, lhs_, rhs_); - } - - KOKKOS_INLINE_FUNCTION - static void copy(const Vector &lhs_, ArrayType &rhsa) { - CVector rhs_(&rhsa.m_data[0], lhs_.size()); - assign(lhs_, rhs_); - } - // lget KOKKOS_INLINE_FUNCTION Vector lget(const size_type row) const { return Vector(lhs.data() + (row * block_size), block_size); } @@ -422,68 +330,60 @@ struct SptrsvWrap { template struct Intermediate : public Common { using Base = Common; - using accum_t = std::conditional_t; Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {} - struct ReduceSumFunctor { + struct ReduceFunctorBasic + { const Base *m_obj; - const lno_t rowid; - lno_t diag; + + ReduceFunctorBasic(const Base* obj, const lno_t=0) : m_obj(obj) {} KOKKOS_INLINE_FUNCTION - void operator()(size_type i, accum_t &accum) const { + static void multiply_subtract(const scalar_t& val, const scalar_t& lhs_col_val, scalar_t& accum) + { + accum -= val * lhs_col_val; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, scalar_t& accum) const + { const auto colid = m_obj->entries(i); - auto val = m_obj->vget(i); - auto lhs_colid = m_obj->lget(colid); - // accum -= val * lhs_colid; - if constexpr (BlockEnabled) { - accum_t temp; - Base::multiply_subtract(val, lhs_colid, temp); - accum += temp; - } else { - Base::multiply_subtract(val, lhs_colid, accum); - } - KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); + multiply_subtract(m_obj->vget(i), m_obj->lget(colid), accum); } }; - struct ReduceSumDiagFunctor { - const Base *m_obj; - const lno_t rowid; - mutable lno_t diag; + struct ReduceFunctorBlock : public ReduceFunctorBasic + { + using P = ReduceFunctorBasic; + + const size_type block_size; + const size_type b; + + ReduceFunctorBlock(const Base* obj, const size_type block_size_, const size_type b_, const lno_t=0) + : P(obj), block_size(block_size_), b(b_) {} KOKKOS_INLINE_FUNCTION - void operator()(size_type i, accum_t &accum) const { - const auto colid = m_obj->entries(i); - if (colid != rowid) { - auto val = m_obj->vget(i); - auto lhs_colid = m_obj->lget(colid); - // accum -= val * lhs_colid; - if constexpr (BlockEnabled) { - accum_t temp; - Base::multiply_subtract(val, lhs_colid, temp); - accum += temp; - } else { - Base::multiply_subtract(val, lhs_colid, accum); - } - } else { - diag = i; - } + void operator()(size_type i, scalar_t& accum) const + { + const auto idx = i / block_size; + const auto colid = P::m_obj->entries(idx); + P::multiply_subtract(P::m_obj->vget(idx)(b, i % block_size), P::m_obj->lget(colid)(b), accum); } }; + /** + * If we want to support Unsorted, we'll need a Functor that returns the ptr + * of the diag item (colid == rowid). Possibly via multi-reduce? The UnsortedTag + * is defined above but no policies actually use it. 
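     * One possible shape for that reduction value, shown only as a sketch with a
     * hypothetical SumWithDiag type (nothing below is implemented here):
     *
     *   struct SumWithDiag {
     *     scalar_t sum  = 0;   // accumulates -val * lhs(colid) over off-diag entries
     *     lno_t    diag = -1;  // offset i with entries(i) == rowid, once encountered
     *   };
     *
     * A custom reducer's join() would add the sums and keep whichever side has
     * diag != -1 (at most one entry per row matches), playing the role the removed
     * ReduceSumDiagFunctor/SumArray combination played for the old code path.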
+ */ + template KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, const int my_rank, const long node_count) const { - using reduce_item_t = typename Base::ArrayType; - using reducer_t = typename Base::SumArray; - using functor_t = std::conditional_t; - - static_assert(!((!IsSerial && BlockEnabled) && UseThreadVec), - "ThreadVectorRanges are not yet supported for block-enabled"); static_assert(!(IsSerial && UseThreadVec), "Requested thread vector range in serial?"); + static_assert(IsSorted, "Unsorted is not yet supported."); const auto rowid = Base::nodes_grouped_by_level(my_rank + node_count); const auto soffset = Base::row_map(rowid); @@ -495,58 +395,77 @@ struct SptrsvWrap { const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); // We don't need the reducer to find the diag item if sorted - functor_t rf{this, rowid, -1}; typename Base::reftype lhs_val = Base::lget(rowid); - reduce_item_t reduce = lhs_val; + + const auto block_size = BlockEnabled ? Base::get_block_size() : 1; if constexpr (IsSerial) { KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); - for (auto ptr = itr_b; ptr < itr_e; ++ptr) { - rf(ptr, reduce); + if constexpr (BlockEnabled) { + for (size_type b = 0; b < block_size; ++b) { + ReduceFunctorBlock rf(this, Base::block_size, b, rowid); + for (size_type i = itr_b * block_size; i < itr_e * block_size; ++i) { + rf(i, lhs_val(b)); + } + } + } + else { + ReduceFunctorBasic rf(this, rowid); + for (size_type i = itr_b; i < itr_e; ++i) { + rf(i, lhs_val); + } } - Base::copy(lhs_val, reduce); } else { KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team"); if constexpr (!UseThreadVec) { - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); - team->team_barrier(); - Base::copy(*team, lhs_val, reduce); + if constexpr (BlockEnabled) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size), [&](size_type b) { + ReduceFunctorBlock rf(this, Base::block_size, b, rowid); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size, itr_e * block_size), rf, lhs_val(b)); + }); + } + else { + ReduceFunctorBasic rf(this, rowid); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val); + } team->team_barrier(); } else { - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); - Base::copy(lhs_val, reduce); + if constexpr (BlockEnabled) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size), [&](size_type b) { + ReduceFunctorBlock rf(this, Base::block_size, b, rowid); + for (size_type i = itr_b * block_size; i < itr_e * block_size; ++i) { + rf(i, lhs_val(b)); + } + }); + } + else { + ReduceFunctorBasic rf(this, rowid); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); + } } } // If sorted, we already know the diag. Otherwise, get it from the reducer - rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; + const lno_t diag = IsLower ? eoffset - 1 : soffset; // At end, handle the diag element. We need to be careful to avoid race // conditions here. 
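      // (In exact arithmetic the update being completed here is
      //    x(i) = ( b(i) - sum_{j != i} A(i,j) * x(j) ) / A(i,i),
      //  where the off-diagonal sum has already been folded into lhs_val by the
      //  reduction above, and add_and_divide contributes the "+ b(i)" and
      //  "/ A(i,i)" steps; x and b are shorthand for lhs and rhs. Illustrative
      //  restatement only.)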
if constexpr (IsSerial) { // Serial case is easy, there's only 1 thread so just do the // add_and_divide - KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); - Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); + KK_KERNEL_ASSERT_MSG(diag != -1, "Serial should always know diag"); + Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag)); } else { - if constexpr (IsSorted) { - // Parallel sorted case is complex. All threads know what the diag is. - // If we have a team sharing the work, we need to ensure only one - // thread performs the add_and_divide (except in BlockEnabled, then - // we can use team operations). - KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); - if constexpr (!UseThreadVec) { - Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(rf.diag)); - } else { - Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); - } + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide (except in BlockEnabled, then + // we can use team operations). + KK_KERNEL_ASSERT_MSG(diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(diag)); } else { - // Parallel unsorted case. Only one thread should know what the diag - // item is. We have that one do the add_and_divide. - if (rf.diag != -1) { - Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); - } + Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag)); } } } From 843f8010966f1b18086863134b65d2c42669e6a6 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 2 Aug 2024 16:47:54 -0600 Subject: [PATCH 36/41] Unset macro --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 15f5d61381..45c20eab72 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2040,4 +2040,6 @@ struct SptrsvWrap { } // namespace Impl } // namespace KokkosSparse +#undef FunctorTypeMacro + #endif From 270a72acc310993779f3c2100e2dc17e057c2128 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sat, 3 Aug 2024 14:07:58 -0600 Subject: [PATCH 37/41] There's no reason to limit team-policy alg to MAX_VEC_SIZE --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 45c20eab72..d777d22290 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -190,8 +190,8 @@ struct SptrsvWrap { using CVector = Kokkos::View>; - static constexpr size_type MAX_VEC_SIZE = 11; - static constexpr size_type BUFF_SIZE = 128; + static constexpr size_type MAX_VEC_SIZE = 16; + static constexpr size_type BUFF_SIZE = 256; using reftype = Vector; @@ -215,7 +215,6 @@ struct SptrsvWrap { block_size(block_size_), block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE); } KOKKOS_INLINE_FUNCTION @@ -279,6 +278,9 @@ struct SptrsvWrap { // Need a temp block to do LU of A const auto block_size = b.size(); + KK_KERNEL_REQUIRE_MSG(block_size <= 
MAX_VEC_SIZE, + "Max supported block size for range-policy is 16. Use team-policy alg if you need more."); + Block LU(&buff[0], block_size, block_size); assign(LU, A); KokkosBatched::SerialLU::invoke(LU); From 48af9302644584394e7193aa90007ab7a5c3385c Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sat, 3 Aug 2024 14:13:56 -0600 Subject: [PATCH 38/41] formatting --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index d777d22290..9f50b5396a 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -79,7 +79,7 @@ struct SptrsvWrap { using range_type = Kokkos::pair; // Tag structs - struct UnsortedTag {}; // This doesn't appear to be supported + struct UnsortedTag {}; // This doesn't appear to be supported struct LargerCutoffTag {}; struct UnsortedLargerCutoffTag {}; @@ -115,7 +115,7 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - using reftype = scalar_t &; + using reftype = scalar_t &; struct SBlock { template @@ -331,45 +331,40 @@ struct SptrsvWrap { */ template struct Intermediate : public Common { - using Base = Common; + using Base = Common; Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {} - struct ReduceFunctorBasic - { + struct ReduceFunctorBasic { const Base *m_obj; - ReduceFunctorBasic(const Base* obj, const lno_t=0) : m_obj(obj) {} + ReduceFunctorBasic(const Base *obj, const lno_t = 0) : m_obj(obj) {} KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const scalar_t& val, const scalar_t& lhs_col_val, scalar_t& accum) - { + static void multiply_subtract(const scalar_t &val, const scalar_t &lhs_col_val, scalar_t &accum) { accum -= val * lhs_col_val; } KOKKOS_INLINE_FUNCTION - void operator()(size_type i, scalar_t& accum) const - { + void operator()(size_type i, scalar_t &accum) const { const auto colid = m_obj->entries(i); multiply_subtract(m_obj->vget(i), m_obj->lget(colid), accum); } }; - struct ReduceFunctorBlock : public ReduceFunctorBasic - { + struct ReduceFunctorBlock : public ReduceFunctorBasic { using P = ReduceFunctorBasic; const size_type block_size; const size_type b; - ReduceFunctorBlock(const Base* obj, const size_type block_size_, const size_type b_, const lno_t=0) - : P(obj), block_size(block_size_), b(b_) {} + ReduceFunctorBlock(const Base *obj, const size_type block_size_, const size_type b_, const lno_t = 0) + : P(obj), block_size(block_size_), b(b_) {} KOKKOS_INLINE_FUNCTION - void operator()(size_type i, scalar_t& accum) const - { + void operator()(size_type i, scalar_t &accum) const { const auto idx = i / block_size; const auto colid = P::m_obj->entries(idx); P::multiply_subtract(P::m_obj->vget(idx)(b, i % block_size), P::m_obj->lget(colid)(b), accum); @@ -411,8 +406,7 @@ struct SptrsvWrap { rf(i, lhs_val(b)); } } - } - else { + } else { ReduceFunctorBasic rf(this, rowid); for (size_type i = itr_b; i < itr_e; ++i) { rf(i, lhs_val); @@ -424,10 +418,10 @@ struct SptrsvWrap { if constexpr (BlockEnabled) { Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size), [&](size_type b) { ReduceFunctorBlock rf(this, Base::block_size, b, rowid); - 
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size, itr_e * block_size), rf, lhs_val(b)); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size, itr_e * block_size), rf, + lhs_val(b)); }); - } - else { + } else { ReduceFunctorBasic rf(this, rowid); Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val); } @@ -439,9 +433,8 @@ struct SptrsvWrap { for (size_type i = itr_b * block_size; i < itr_e * block_size; ++i) { rf(i, lhs_val(b)); } - }); - } - else { + }); + } else { ReduceFunctorBasic rf(this, rowid); Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); } From e01f9aff365c5f28fc70a05cc87eca1dbb3166b0 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sat, 3 Aug 2024 14:19:25 -0600 Subject: [PATCH 39/41] Add missing kokkos-inline-funcs --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 9f50b5396a..c0cd276ef5 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -340,6 +340,7 @@ struct SptrsvWrap { struct ReduceFunctorBasic { const Base *m_obj; + KOKKOS_INLINE_FUNCTION ReduceFunctorBasic(const Base *obj, const lno_t = 0) : m_obj(obj) {} KOKKOS_INLINE_FUNCTION @@ -360,6 +361,7 @@ struct SptrsvWrap { const size_type block_size; const size_type b; + KOKKOS_INLINE_FUNCTION ReduceFunctorBlock(const Base *obj, const size_type block_size_, const size_type b_, const lno_t = 0) : P(obj), block_size(block_size_), b(b_) {} From 08434250d33c842e73ee5f8a57bc49fc94fa15f7 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 5 Aug 2024 12:20:45 -0600 Subject: [PATCH 40/41] Fix warnings --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index c0cd276ef5..9e8ac3d5dc 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -396,15 +396,16 @@ struct SptrsvWrap { // We don't need the reducer to find the diag item if sorted typename Base::reftype lhs_val = Base::lget(rowid); - const auto block_size = BlockEnabled ? Base::get_block_size() : 1; + const auto block_size_ = BlockEnabled ? 
Base::get_block_size() : 1; + (void) block_size_; // Some settings do not use this var if constexpr (IsSerial) { KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); if constexpr (BlockEnabled) { - for (size_type b = 0; b < block_size; ++b) { - ReduceFunctorBlock rf(this, Base::block_size, b, rowid); - for (size_type i = itr_b * block_size; i < itr_e * block_size; ++i) { + for (size_type b = 0; b < block_size_; ++b) { + ReduceFunctorBlock rf(this, block_size_, b, rowid); + for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) { rf(i, lhs_val(b)); } } @@ -418,9 +419,9 @@ struct SptrsvWrap { KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team"); if constexpr (!UseThreadVec) { if constexpr (BlockEnabled) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size), [&](size_type b) { - ReduceFunctorBlock rf(this, Base::block_size, b, rowid); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size, itr_e * block_size), rf, + Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size_), [&](size_type b) { + ReduceFunctorBlock rf(this, block_size_, b, rowid); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size_, itr_e * block_size_), rf, lhs_val(b)); }); } else { @@ -430,9 +431,9 @@ struct SptrsvWrap { team->team_barrier(); } else { if constexpr (BlockEnabled) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size), [&](size_type b) { - ReduceFunctorBlock rf(this, Base::block_size, b, rowid); - for (size_type i = itr_b * block_size; i < itr_e * block_size; ++i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size_), [&](size_type b) { + ReduceFunctorBlock rf(this, block_size_, b, rowid); + for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) { rf(i, lhs_val(b)); } }); From 364a0ab7d46f481a8c8ab7b1987c439faa1c8ca4 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 5 Aug 2024 12:21:12 -0600 Subject: [PATCH 41/41] format --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 9e8ac3d5dc..0d17fad247 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -397,7 +397,7 @@ struct SptrsvWrap { typename Base::reftype lhs_val = Base::lget(rowid); const auto block_size_ = BlockEnabled ? Base::get_block_size() : 1; - (void) block_size_; // Some settings do not use this var + (void)block_size_; // Some settings do not use this var if constexpr (IsSerial) { KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial");