From c48d35107be2c80ae993d06ee737a7a868c760ff Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 11 Jan 2024 11:09:51 -0700 Subject: [PATCH 01/17] progess --- perf_test/sparse/KokkosSparse_spiluk.cpp | 6 - .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 415 +++++------------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 8 +- sparse/src/KokkosSparse_spiluk_handle.hpp | 17 - sparse/unit_test/Test_Sparse_spiluk.hpp | 103 +++-- 5 files changed, 190 insertions(+), 359 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index 331ae9ec82..c85b126019 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -144,12 +144,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, // std::cout << "Create handle" << std::endl; switch (test) { - case LVLSCHED_RP: - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, - EXPAND_FACT * nnz * (fill_lev + 1), - EXPAND_FACT * nnz * (fill_lev + 1)); - kh.get_spiluk_handle()->print_algorithm(); - break; case LVLSCHED_TP1: kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, EXPAND_FACT * nnz * (fill_lev + 1), diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 9484a02c11..beacee1872 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -58,7 +58,7 @@ struct IlukWrap { using team_policy = typename IlukHandle::TeamPolicy; using member_type = typename team_policy::member_type; using range_policy = typename IlukHandle::RangePolicy; - using sview_1d = typename Kokkos::View; + using sview_1d = typename Kokkos::View; static team_policy get_team_policy(const size_type nrows, const int team_size) { @@ -68,6 +68,7 @@ struct IlukWrap { } else { rv = team_policy(nrows, team_size); } + return rv; } @@ -80,17 +81,7 @@ struct IlukWrap { } else { rv = team_policy(exe_space, nrows, team_size); } - return rv; - } - static range_policy get_range_policy(const lno_t start, const lno_t end) { - range_policy rv(start, end); - return rv; - } - - static range_policy get_range_policy(execution_space exe_space, - const lno_t start, const lno_t end) { - range_policy rv(exe_space, start, end); return rv; } @@ -116,9 +107,6 @@ struct IlukWrap { WorkViewType iw; lno_t lev_start; - // unblocked does not require any buffer - static constexpr size_type BUFF_SIZE = 1; - Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, @@ -181,11 +169,10 @@ struct IlukWrap { KOKKOS_INLINE_FUNCTION void add(scalar_t &lhs, const scalar_t &rhs) const { lhs += rhs; } - // multiply: return (alpha * lhs) * rhs + // gemm, alpha is hardcoded to -1, beta hardcoded to 1 KOKKOS_INLINE_FUNCTION - scalar_t multiply(const scalar_t &alpha, const scalar_t &lhs, - const scalar_t &rhs, scalar_t *) const { - return alpha * lhs * rhs; + void gemm(const scalar_t &A, const scalar_t &B, scalar_t &C) const { + C += -1*A*B; } // lget @@ -230,9 +217,6 @@ struct IlukWrap { size_type block_items; sview_1d ones; - // blocked requires a buffer to store gemm output - static constexpr size_type BUFF_SIZE = 128; - using LValuesUnmanaged2DBlockType = Kokkos::View< typename LValuesType::value_type **, typename KokkosKernels::Impl::GetUnifiedLayout< @@ -279,7 +263,6 @@ struct IlukWrap { Kokkos::deep_copy(ones, 1.0); KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); } // lset @@ -356,20 +339,18 @@ struct IlukWrap { KokkosBatched::SerialAxpy::invoke(ones, rhs, lhs); } - // multiply: return (alpha * lhs) * rhs + // gemm, alpha is hardcoded to -1, beta hardcoded to 1 + template KOKKOS_INLINE_FUNCTION - LValuesUnmanaged2DBlockType multiply(const scalar_t &alpha, - const UValuesUnmanaged2DBlockType &lhs, - const LValuesUnmanaged2DBlockType &rhs, - scalar_t *buff) const { - LValuesUnmanaged2DBlockType result(&buff[0], block_size, block_size); + void gemm(const UValuesUnmanaged2DBlockType &A, + const LValuesUnmanaged2DBlockType &B, + CView& C) const { KokkosBatched::SerialGemm:: - invoke( - alpha, lhs, rhs, 0.0, result); - return result; + invoke( + -1.0, A, B, 1.0, C); } // lget @@ -408,103 +389,6 @@ struct IlukWrap { } }; - template - struct ILUKLvlSchedRPNumericFunctor - : public Common { - using Base = Common; - - ILUKLvlSchedRPNumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const lno_t &lev_start_, - const size_type &block_size_ = 0) - : Base(A_row_map_, A_entries_, A_values_, L_row_map_, L_entries_, - L_values_, U_row_map_, U_entries_, U_values_, level_idx_, iw_, - lev_start_, block_size_) {} - - KOKKOS_FUNCTION - void operator()(const lno_t i) const { - scalar_t buff[Base::BUFF_SIZE]; - - const auto rowid = Base::level_idx(i); - const auto tid = i - Base::lev_start; - auto k1 = Base::L_row_map(rowid); - auto k2 = Base::L_row_map(rowid + 1) - 1; - Base::lset_id(k2); - for (auto k = k1; k < k2; ++k) { - const auto col = Base::L_entries(k); - Base::lset(k, 0.0); - Base::iw(tid, col) = k; - } - - k1 = Base::U_row_map(rowid); - k2 = Base::U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - const auto col = Base::U_entries(k); - Base::uset(k, 0.0); - Base::iw(tid, col) = k; - } - - k1 = Base::A_row_map(rowid); - k2 = Base::A_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - const auto col = Base::A_entries(k); - const auto ipos = Base::iw(tid, col); - if (col < rowid) { - Base::lset(ipos, Base::aget(k)); - } else { - Base::uset(ipos, Base::aget(k)); - } - } - - // Eliminate prev rows - k1 = Base::L_row_map(rowid); - k2 = Base::L_row_map(rowid + 1) - 1; - for (auto k = k1; k < k2; ++k) { - const auto prev_row = Base::L_entries(k); - const auto u_diag = Base::uget(Base::U_row_map(prev_row)); - Base::divide(Base::lget(k), u_diag); - auto fact = Base::lget(k); - for (auto kk = Base::U_row_map(prev_row) + 1; - kk < Base::U_row_map(prev_row + 1); ++kk) { - const auto col = Base::U_entries(kk); - const auto ipos = Base::iw(tid, col); - if (ipos == -1) continue; - const auto lxu = Base::multiply(-1.0, Base::uget(kk), fact, &buff[0]); - if (col < rowid) { - Base::add(Base::lget(ipos), lxu); - } else { - Base::add(Base::uget(ipos), lxu); - } - } // end for kk - } // end for k - - const auto ipos = Base::iw(tid, rowid); - if (Base::uequal(ipos, 0.0)) { - Base::uset(ipos, 1e6); - } - - // Reset - k1 = Base::L_row_map(rowid); - k2 = Base::L_row_map(rowid + 1) - 1; - for (auto k = k1; k < k2; ++k) Base::iw(tid, Base::L_entries(k)) = -1; - - k1 = Base::U_row_map(rowid); - k2 = Base::U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) Base::iw(tid, Base::U_entries(k)) = -1; - } - }; - template + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + team_policy tpolicy = + get_team_policy(lvl_nrows_chunk, team_size); KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values, - rpolicy, "parfor_fixed_lvl", level_idx, iw, - lev_start, RPF, RPB, thandle.is_block_enabled(), - block_size); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm:: - SEQLVLSCHD_TP1) { - lno_t lvl_rowid_start = 0; - lno_t lvl_nrows_chunk; - for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { - if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > - (lev_end - lev_start)) - lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; - else - lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - - team_policy tpolicy = get_team_policy(lvl_nrows_chunk, team_size); - KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, tpolicy, "parfor_tp1", level_idx, iw, - lev_start + lvl_rowid_start, TPF, TPB, - thandle.is_block_enabled(), block_size); - Kokkos::fence(); - lvl_rowid_start += lvl_nrows_chunk; - } + L_entries, L_values, U_row_map, U_entries, + U_values, tpolicy, "parfor_tp1", level_idx, iw, + lev_start + lvl_rowid_start, TPF, TPB, + thandle.is_block_enabled(), block_size); + Kokkos::fence(); + lvl_rowid_start += lvl_nrows_chunk; } } // end if } // end for lvl - //} // Output check #ifdef NUMERIC_OUTPUT_INFO @@ -781,8 +644,6 @@ struct IlukWrap { const std::vector &U_row_map_v, const std::vector &U_entries_v, std::vector &U_values_v) { - using RPF = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, false); - using RPB = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, true); using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); @@ -811,110 +672,70 @@ struct IlukWrap { if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - // Assume all streams use the same algorithm - if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) - stream_have_level_v[i] = true; - else - stream_have_level_v[i] = false; - } else - stream_have_level_v[i] = false; - } + std::vector lvl_nchunks_h_v(nstreams); + std::vector lvl_nrowsperchunk_h_v(nstreams); + std::vector lvl_rowid_start_v(nstreams); + std::vector team_size_v(nstreams); - // Main work of the level across streams - // 1. Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Launch only if stream i-th has this level - if (stream_have_level_v[i]) { - range_policy rpolicy = - get_range_policy(execspace_v[i], lvl_start_v[i], lvl_end_v[i]); - KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - rpolicy, "parfor_rp", lvl_idx_v[i], iw_v[i], - lvl_start_v[i], RPF, RPB, is_block_enabled_v[i], - block_size_v[i]); - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for lvl - } // end SEQLVLSCHD_RP - else if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - std::vector lvl_nchunks_h_v(nstreams); - std::vector lvl_nrowsperchunk_h_v(nstreams); - std::vector lvl_rowid_start_v(nstreams); - std::vector team_size_v(nstreams); + for (int i = 0; i < nstreams; i++) { + lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); + lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); + team_size_v[i] = thandle_v[i]->get_team_size(); + } + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + lno_t lvl_nchunks_max = 0; for (int i = 0; i < nstreams; i++) { - lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); - lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); - team_size_v[i] = thandle_v[i]->get_team_size(); - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - lno_t lvl_nchunks_max = 0; - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { - stream_have_level_v[i] = true; - lvl_rowid_start_v[i] = 0; - if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) - lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); - } else - stream_have_level_v[i] = false; + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { + stream_have_level_v[i] = true; + lvl_rowid_start_v[i] = 0; + if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) + lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); } else stream_have_level_v[i] = false; - } - - // Main work of the level across streams -- looping through chunnks - for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { - // 1. Launch work on all streams (for each chunk) - for (int i = 0; i < nstreams; i++) { - // Launch only if stream i-th has this level - if (stream_have_level_v[i]) { - // Launch only if stream i-th has this chunk - if (chunkid < lvl_nchunks_h_v[i](lvl)) { - // 1.a. Specify number of rows (i.e. number of teams) to launch - lno_t lvl_nrows_chunk = 0; - if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > - (lvl_end_v[i] - lvl_start_v[i])) - lvl_nrows_chunk = - (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; - else - lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); - - // 1.b. Create functor for stream i-th and launch - team_policy tpolicy = get_team_policy( - execspace_v[i], lvl_nrows_chunk, team_size_v[i]); - KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], - lvl_start_v[i] + lvl_rowid_start_v[i], TPF, - TPB, is_block_enabled_v[i], block_size_v[i]); - // 1.c. Ready to move to next chunk - lvl_rowid_start_v[i] += lvl_nrows_chunk; - } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for chunkid - } // end for lvl - } // end SEQLVLSCHD_TP1 + } else + stream_have_level_v[i] = false; + } + // Main work of the level across streams -- looping through chunnks + for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { + // 1. Launch work on all streams (for each chunk) + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + // Launch only if stream i-th has this chunk + if (chunkid < lvl_nchunks_h_v[i](lvl)) { + // 1.a. Specify number of rows (i.e. number of teams) to launch + lno_t lvl_nrows_chunk = 0; + if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > + (lvl_end_v[i] - lvl_start_v[i])) + lvl_nrows_chunk = + (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + else + lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); + + // 1.b. Create functor for stream i-th and launch + team_policy tpolicy = get_team_policy( + execspace_v[i], lvl_nrows_chunk, team_size_v[i]); + KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], + lvl_start_v[i] + lvl_rowid_start_v[i], TPF, + TPB, is_block_enabled_v[i], block_size_v[i]); + // 1.c. Ready to move to next chunk + lvl_rowid_start_v[i] += lvl_nrows_chunk; + } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) + } // end if (stream_have_level_v[i]) + } // end for streams + } // end for chunkid + } // end for lvl } // end iluk_numeric_streams }; // IlukWrap diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 9e40c23af7..31c2494cdd 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -229,9 +229,7 @@ void iluk_symbolic(IlukHandle& thandle, LEntriesType& L_entries_d, URowMapType& U_row_map_d, UEntriesType& U_entries_d, int nstreams = 1) { if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP || - thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) /* || thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHED_TP2 )*/ { @@ -379,7 +377,7 @@ void iluk_symbolic(IlukHandle& thandle, std::ostringstream os; os << "KokkosSparse::Experimental::spiluk_symbolic: U_entries's extent " "must be larger than " - << U_entries_d.extent(0); + << U_entries_d.extent(0) << ", must be at least " << cntU + lenu + 1; KokkosKernels::Impl::throw_runtime_exception(os.str()); } // U diag entry @@ -401,7 +399,7 @@ void iluk_symbolic(IlukHandle& thandle, std::ostringstream os; os << "KokkosSparse::Experimental::spiluk_symbolic: L_entries's extent " "must be larger than " - << L_entries_d.extent(0); + << L_entries_d.extent(0) << ", must be at least " << cntL + lenl + 1; KokkosKernels::Impl::throw_runtime_exception(os.str()); } for (size_type k = 0; k < lenl; ++k) { diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index 2b37d08f6e..952a14aa2d 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -29,7 +29,6 @@ namespace Experimental { // TP2 algorithm has issues with some offset-ordinal combo to be addressed enum class SPILUKAlgorithm { - SEQLVLSCHD_RP, SEQLVLSCHD_TP1 /*, SEQLVLSCHED_TP2*/ }; @@ -256,9 +255,6 @@ class SPILUKHandle { int get_vector_size() const { return this->vector_size; } void print_algorithm() { - if (algm == SPILUKAlgorithm::SEQLVLSCHD_RP) - std::cout << "SEQLVLSCHD_RP" << std::endl; - if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; @@ -269,19 +265,6 @@ class SPILUKHandle { } */ } - - inline SPILUKAlgorithm StringToSPILUKAlgorithm(std::string &name) { - if (name == "SPILUK_DEFAULT") - return SPILUKAlgorithm::SEQLVLSCHD_RP; - else if (name == "SPILUK_RANGEPOLICY") - return SPILUKAlgorithm::SEQLVLSCHD_RP; - else if (name == "SPILUK_TEAMPOLICY1") - return SPILUKAlgorithm::SEQLVLSCHD_TP1; - /*else if(name=="SPILUK_TEAMPOLICY2") return - * SPILUKAlgorithm::SEQLVLSCHED_TP2;*/ - else - throw std::runtime_error("Invalid SPILUKAlgorithm name"); - } }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 7d52d08ee6..ee227043e6 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -120,7 +120,8 @@ struct SpilukTest { } static void check_result(const RowMapType& row_map, - const EntriesType& entries, const ValuesType& values, + const EntriesType& entries, + const ValuesType& values, const RowMapType& L_row_map, const EntriesType& L_entries, const ValuesType& L_values, @@ -156,6 +157,7 @@ struct SpilukTest { U_entries, block_size); const auto result = check_result_impl(A, L, U, nrows, block_size); + EXPECT_LT(result, 1e0); } @@ -165,7 +167,7 @@ struct SpilukTest { const EntriesType& entries, const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev) { const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 4 * nrows, 4 * nrows); + kh.create_spiluk_handle(alg, nrows, 20 * nrows, 20 * nrows); auto spiluk_handle = kh.get_spiluk_handle(); @@ -195,21 +197,6 @@ struct SpilukTest { kh.destroy_spiluk_handle(); - // For team policy alg, check results against range policy - if (alg == SPILUKAlgorithm::SEQLVLSCHD_TP1) { - const auto [L_row_map_rp, L_entries_rp, L_values_rp, U_row_map_rp, - U_entries_rp, U_values_rp] = - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev); - - EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(L_values, L_values_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(U_values, U_values_rp, EPS); - } - return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); } @@ -220,7 +207,7 @@ struct SpilukTest { const size_type block_size) { const size_type block_items = block_size * block_size; const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 4 * nrows, 4 * nrows, block_size); + kh.create_spiluk_handle(alg, nrows, 20 * nrows, 20 * nrows, block_size); auto spiluk_handle = kh.get_spiluk_handle(); @@ -279,11 +266,70 @@ struct SpilukTest { KernelHandle kh; run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev); + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + } + + static void run_test_spiluk_scale() { + + // Create a diagonally dominant sparse matrix to test: + constexpr auto nrows = 5000; + constexpr auto diagDominance = 2; + + size_type nnz = 10 * nrows; + auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + Crs>(nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + // Pull out views from CRS + RowMapType row_map("row_map", A.graph.row_map.extent(0)); + EntriesType entries("entries", A.graph.entries.extent(0)); + ValuesType values("values", A.values.extent(0)); + Kokkos::deep_copy(row_map, A.graph.row_map); + Kokkos::deep_copy(entries, A.graph.entries); + Kokkos::deep_copy(values, A.values); + + const lno_t fill_lev = 2; + + KernelHandle kh; + run_and_check_spiluk(kh, row_map, entries, values, SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); } + static void run_test_spiluk_scale_blocks() { + + // Create a diagonally dominant sparse matrix to test: + constexpr auto nrows = 5000; + constexpr auto diagDominance = 2; + + RowMapType brow_map; + EntriesType bentries; + ValuesType bvalues; + + const size_type block_size = 10; + + size_type nnz = 10 * nrows; + auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + Crs>(nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + // Pull out views from CRS + Bsr bsr(A, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + const lno_t fill_lev = 2; + + KernelHandle kh; + + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, block_size); + } + static void run_test_spiluk_streams(SPILUKAlgorithm test_algo, int nstreams) { // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition @@ -539,10 +585,8 @@ struct SpilukTest { KernelHandle kh; // Check block_size=1 produces identical result to unblocked - run_and_check_spiluk_block(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev, 1); - run_and_check_spiluk_block(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); + // run_and_check_spiluk_block(kh, row_map, entries, values, + // SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); // Convert to BSR Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, @@ -557,9 +601,6 @@ struct SpilukTest { Kokkos::deep_copy(bentries, bsr.graph.entries); Kokkos::deep_copy(bvalues, bsr.values); - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev, - block_size); run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, block_size); @@ -574,6 +615,8 @@ void test_spiluk() { using TestStruct = Test::SpilukTest; TestStruct::run_test_spiluk(); TestStruct::run_test_spiluk_blocks(); + TestStruct::run_test_spiluk_scale(); + TestStruct::run_test_spiluk_scale_blocks(); } template ; - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 1); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 2); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 3); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 4); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 1); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 2); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 3); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 4); TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, From 50b6dde2afe61056a2bab583e981b02f83fcce8d Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 11 Jan 2024 11:44:00 -0700 Subject: [PATCH 02/17] Fix for gemm --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index beacee1872..2b483bda2c 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -107,6 +107,8 @@ struct IlukWrap { WorkViewType iw; lno_t lev_start; + using reftype = scalar_t&; + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, @@ -238,6 +240,8 @@ struct IlukWrap { typename AValuesType::device_type, Kokkos::MemoryTraits >; + using reftype = LValuesUnmanaged2DBlockType; + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, @@ -472,7 +476,7 @@ struct IlukWrap { const auto col = Base::U_entries(kk); const auto ipos = Base::iw(my_team, col); if (ipos != -1) { - auto C = col < rowid ? Base::lget(ipos) : Base::uget(ipos); + typename Base::reftype C = col < rowid ? Base::lget(ipos) : Base::uget(ipos); Base::gemm(Base::uget(kk), fact, C); } }); // end for kk From 869f6ea2f0a1c4076c30298ee25d2d7a5a53f86b Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 11 Jan 2024 12:47:53 -0700 Subject: [PATCH 03/17] formatting --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 85 +++++++++---------- sparse/unit_test/Test_Sparse_spiluk.hpp | 18 ++-- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 2b483bda2c..d52e5cbc97 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -58,7 +58,7 @@ struct IlukWrap { using team_policy = typename IlukHandle::TeamPolicy; using member_type = typename team_policy::member_type; using range_policy = typename IlukHandle::RangePolicy; - using sview_1d = typename Kokkos::View; + using sview_1d = typename Kokkos::View; static team_policy get_team_policy(const size_type nrows, const int team_size) { @@ -107,7 +107,7 @@ struct IlukWrap { WorkViewType iw; lno_t lev_start; - using reftype = scalar_t&; + using reftype = scalar_t &; Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, @@ -174,7 +174,7 @@ struct IlukWrap { // gemm, alpha is hardcoded to -1, beta hardcoded to 1 KOKKOS_INLINE_FUNCTION void gemm(const scalar_t &A, const scalar_t &B, scalar_t &C) const { - C += -1*A*B; + C += -1 * A * B; } // lget @@ -345,16 +345,15 @@ struct IlukWrap { // gemm, alpha is hardcoded to -1, beta hardcoded to 1 template - KOKKOS_INLINE_FUNCTION - void gemm(const UValuesUnmanaged2DBlockType &A, - const LValuesUnmanaged2DBlockType &B, - CView& C) const { + KOKKOS_INLINE_FUNCTION void gemm(const UValuesUnmanaged2DBlockType &A, + const LValuesUnmanaged2DBlockType &B, + CView &C) const { KokkosBatched::SerialGemm:: - invoke( - -1.0, A, B, 1.0, C); + invoke( + -1.0, A, B, 1.0, C); } // lget @@ -427,10 +426,10 @@ struct IlukWrap { Base::lset_id(team, k2); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::L_entries(k); - Base::lset(k, 0.0); - Base::iw(my_team, col) = k; - }); + const auto col = Base::L_entries(k); + Base::lset(k, 0.0); + Base::iw(my_team, col) = k; + }); team.team_barrier(); @@ -438,10 +437,10 @@ struct IlukWrap { k2 = Base::U_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::U_entries(k); - Base::uset(k, 0.0); - Base::iw(my_team, col) = k; - }); + const auto col = Base::U_entries(k); + Base::uset(k, 0.0); + Base::iw(my_team, col) = k; + }); team.team_barrier(); @@ -450,14 +449,14 @@ struct IlukWrap { k2 = Base::A_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::A_entries(k); - const auto ipos = Base::iw(my_team, col); - if (col < rowid) { - Base::lset(ipos, Base::aget(k)); - } else { - Base::uset(ipos, Base::aget(k)); - } - }); + const auto col = Base::A_entries(k); + const auto ipos = Base::iw(my_team, col); + if (col < rowid) { + Base::lset(ipos, Base::aget(k)); + } else { + Base::uset(ipos, Base::aget(k)); + } + }); team.team_barrier(); @@ -476,7 +475,8 @@ struct IlukWrap { const auto col = Base::U_entries(kk); const auto ipos = Base::iw(my_team, col); if (ipos != -1) { - typename Base::reftype C = col < rowid ? Base::lget(ipos) : Base::uget(ipos); + typename Base::reftype C = + col < rowid ? Base::lget(ipos) : Base::uget(ipos); Base::gemm(Base::uget(kk), fact, C); } }); // end for kk @@ -498,17 +498,17 @@ struct IlukWrap { k2 = Base::L_row_map(rowid + 1) - 1; Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::L_entries(k); - Base::iw(my_team, col) = -1; - }); + const auto col = Base::L_entries(k); + Base::iw(my_team, col) = -1; + }); k1 = Base::U_row_map(rowid); k2 = Base::U_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::U_entries(k); - Base::iw(my_team, col) = -1; - }); + const auto col = Base::U_entries(k); + Base::iw(my_team, col) = -1; + }); } }; @@ -556,7 +556,7 @@ struct IlukWrap { level_nchunks_h = thandle.get_level_nchunks(); level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); - iw = thandle.get_iw(); + iw = thandle.get_iw(); // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead @@ -574,11 +574,10 @@ struct IlukWrap { else lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - team_policy tpolicy = - get_team_policy(lvl_nrows_chunk, team_size); + team_policy tpolicy = get_team_policy(lvl_nrows_chunk, team_size); KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, tpolicy, "parfor_tp1", level_idx, iw, + L_entries, L_values, U_row_map, U_entries, U_values, + tpolicy, "parfor_tp1", level_idx, iw, lev_start + lvl_rowid_start, TPF, TPB, thandle.is_block_enabled(), block_size); Kokkos::fence(); @@ -720,19 +719,19 @@ struct IlukWrap { if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > (lvl_end_v[i] - lvl_start_v[i])) lvl_nrows_chunk = - (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; else lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); // 1.b. Create functor for stream i-th and launch team_policy tpolicy = get_team_policy( - execspace_v[i], lvl_nrows_chunk, team_size_v[i]); + execspace_v[i], lvl_nrows_chunk, team_size_v[i]); KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], - lvl_start_v[i] + lvl_rowid_start_v[i], TPF, - TPB, is_block_enabled_v[i], block_size_v[i]); + lvl_start_v[i] + lvl_rowid_start_v[i], TPF, TPB, + is_block_enabled_v[i], block_size_v[i]); // 1.c. Ready to move to next chunk lvl_rowid_start_v[i] += lvl_nrows_chunk; } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) @@ -740,7 +739,7 @@ struct IlukWrap { } // end for streams } // end for chunkid } // end for lvl - } // end iluk_numeric_streams + } // end iluk_numeric_streams }; // IlukWrap diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index ee227043e6..625dda2da9 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -120,8 +120,7 @@ struct SpilukTest { } static void check_result(const RowMapType& row_map, - const EntriesType& entries, - const ValuesType& values, + const EntriesType& entries, const ValuesType& values, const RowMapType& L_row_map, const EntriesType& L_entries, const ValuesType& L_values, @@ -270,14 +269,14 @@ struct SpilukTest { } static void run_test_spiluk_scale() { - // Create a diagonally dominant sparse matrix to test: constexpr auto nrows = 5000; constexpr auto diagDominance = 2; size_type nnz = 10 * nrows; - auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< - Crs>(nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); // Pull out views from CRS RowMapType row_map("row_map", A.graph.row_map.extent(0)); @@ -296,7 +295,6 @@ struct SpilukTest { } static void run_test_spiluk_scale_blocks() { - // Create a diagonally dominant sparse matrix to test: constexpr auto nrows = 5000; constexpr auto diagDominance = 2; @@ -308,8 +306,9 @@ struct SpilukTest { const size_type block_size = 10; size_type nnz = 10 * nrows; - auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< - Crs>(nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); // Pull out views from CRS Bsr bsr(A, block_size); @@ -327,7 +326,8 @@ struct SpilukTest { KernelHandle kh; run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, block_size); + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); } static void run_test_spiluk_streams(SPILUKAlgorithm test_algo, int nstreams) { From 2ebf31fb1d5f24dcadb3e1747cac4f06a32ca343 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 11 Jan 2024 13:44:56 -0700 Subject: [PATCH 04/17] Remove unused divide method --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index d52e5cbc97..b43a39700e 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -157,9 +157,6 @@ struct IlukWrap { } // divide. lhs /= rhs - KOKKOS_INLINE_FUNCTION - void divide(scalar_t &lhs, const scalar_t &rhs) const { lhs /= rhs; } - KOKKOS_INLINE_FUNCTION void divide(const member_type &team, scalar_t &lhs, const scalar_t &rhs) const { @@ -315,17 +312,6 @@ struct IlukWrap { } // divide. lhs /= rhs - KOKKOS_INLINE_FUNCTION - void divide(const LValuesUnmanaged2DBlockType &lhs, - const UValuesUnmanaged2DBlockType &rhs) const { - KokkosBatched::SerialTrsm< - KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, // not 100% on this - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this - invoke(1.0, rhs, lhs); - } - KOKKOS_INLINE_FUNCTION void divide(const member_type &team, const LValuesUnmanaged2DBlockType &lhs, const UValuesUnmanaged2DBlockType &rhs) const { From 379387504d31061b78ac511d34ad17f41f62aed8 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 11 Jan 2024 16:06:09 -0700 Subject: [PATCH 05/17] Fix warning --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index b43a39700e..bbd2343cd3 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -565,7 +565,7 @@ struct IlukWrap { L_entries, L_values, U_row_map, U_entries, U_values, tpolicy, "parfor_tp1", level_idx, iw, lev_start + lvl_rowid_start, TPF, TPB, - thandle.is_block_enabled(), block_size); + block_enabled, block_size); Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } From 622f234c7fa5a2762bd27eb4984e45e0e027acc5 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 22 Jan 2024 16:08:13 -0700 Subject: [PATCH 06/17] Enhancements to spiluk test --- sparse/unit_test/Test_Sparse_spiluk.hpp | 198 ++++++++++++++++++------ 1 file changed, 152 insertions(+), 46 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 625dda2da9..41e4987176 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -41,6 +41,9 @@ using namespace KokkosKernels::Experimental; using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; +// Comment this out to do focussed debugging +#define TEST_SPILUK_FULL_CHECKS + namespace Test { template @@ -82,6 +85,7 @@ struct SpilukTest { using ValuesType_hostmirror = typename ValuesType::HostMirror; using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; + using range_policy = Kokkos::RangePolicy; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, @@ -119,6 +123,47 @@ struct SpilukTest { return diff_nrm / bb_nrm; } + static bool is_triangular(const RowMapType& drow_map, const EntriesType& dentries, const ValuesType& dvalues, bool check_lower, const size_type block_size = 1) + { + const auto nrows = row_map.extent(0) - 1; + const auto block_items = block_size * block_size; + + auto row_map = Kokkos::create_mirror_view(drow_map); + auto entries = Kokkos::create_mirror_view(dentries); + auto values = Kokkos::create_mirror_view(dvalues); + Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(values, dvalues); + + for (size_type row = 0; row < nrows; ++row) { + const auto row_nnz_begin = row_map(row); + const auto row_nnz_end = row_map(row+1); + for (size_type nnz = row_nnz_begin; nnz < row_nnz_end; ++nnz) { + const auto col = entries(nnz); + if (col > row && check_lower) { + return false; + } + else if (col < row && !check_lower) { + return false; + } + else if (col == row && block_size > 1) { + // Check diagonal block + scalar_t* block = values.data() + nnz * block_items; + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + if ( (j > i && check_lower && block[i*block_size + j] != 0.0 ) || + (j < i && !check_lower && block[i*block_size + j] != 0.0) ) { + std::cout << "Bad block entry is: " << block[i*block_size + j] << std::endl; + return false; + } + } + } + } + } + } + return true; + } + static void check_result(const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const RowMapType& L_row_map, @@ -135,7 +180,15 @@ struct SpilukTest { Crs U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, U_entries); + EXPECT_TRUE(is_triangular(L_row_map, L_entries, L_values, true)); + EXPECT_TRUE(is_triangular(U_row_map, U_entries, U_values, false)); + const auto result = check_result_impl(A, L, U, nrows); + // std::cout << "For nrows=" << nrows << ", unblocked had residual: " << result << std::endl; + // std::cout << "L" << std::endl; + // print_matrix(decompress_matrix(L_row_map, L_entries, L_values)); + // std::cout << "U" << std::endl; + // print_matrix(decompress_matrix(U_row_map, U_entries, U_values)); EXPECT_LT(result, 1e-4); } @@ -155,20 +208,31 @@ struct SpilukTest { Bsr U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, U_entries, block_size); + EXPECT_TRUE(is_triangular(L_row_map, L_entries, L_values, true, block_size)); + EXPECT_TRUE(is_triangular(U_row_map, U_entries, U_values, false, block_size)); + const auto result = check_result_impl(A, L, U, nrows, block_size); + // std::cout << "For nrows=" << nrows << ", block_size=" << block_size << " had residual: " << result << std::endl; + // std::cout << "L" << std::endl; + // print_matrix(decompress_matrix(L_row_map, L_entries, L_values, block_size)); + // std::cout << "U" << std::endl; + // print_matrix(decompress_matrix(U_row_map, U_entries, U_values, block_size)); - EXPECT_LT(result, 1e0); + EXPECT_LT(result, 1e-2); } static std::tuple run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, - SPILUKAlgorithm alg, const lno_t fill_lev) { + SPILUKAlgorithm alg, const lno_t fill_lev, const int team_size = -1) { const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 20 * nrows, 20 * nrows); + kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows); auto spiluk_handle = kh.get_spiluk_handle(); + if (team_size != -1) { + spiluk_handle->set_team_size(team_size); + } // Allocate L and U as outputs RowMapType L_row_map("L_row_map", nrows + 1); @@ -196,19 +260,40 @@ struct SpilukTest { kh.destroy_spiluk_handle(); +#ifdef TEST_SPILUK_FULL_CHECKS + // Check that team size = 1 produces same result + if (team_size != 1) { + const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, + U_entries_ts1, U_values_ts1] = + run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, 1); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_ts1, EPS); + } +#endif + return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); } - static void run_and_check_spiluk_block( + static std::tuple + run_and_check_spiluk_block( KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev, - const size_type block_size) { + const size_type block_size, const int team_size = -1) { const size_type block_items = block_size * block_size; const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 20 * nrows, 20 * nrows, block_size); + kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, block_size); auto spiluk_handle = kh.get_spiluk_handle(); + if (team_size != -1) { + spiluk_handle->set_team_size(team_size); + } // Allocate L and U as outputs RowMapType L_row_map("L_row_map", nrows + 1); @@ -236,11 +321,12 @@ struct SpilukTest { kh.destroy_spiluk_handle(); +#ifdef TEST_SPILUK_FULL_CHECKS // If block_size is 1, results should exactly match unblocked results if (block_size == 1) { const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, U_entries_u, U_values_u] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, team_size); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); @@ -249,6 +335,24 @@ struct SpilukTest { EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_u, EPS); EXPECT_NEAR_KK_1DVIEW(U_values, U_values_u, EPS); } + + // Check that team size = 1 produces same result + if (team_size != 1) { + const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, + U_entries_ts1, U_values_ts1] = + run_and_check_spiluk_block(kh, row_map, entries, values, alg, fill_lev, block_size, 1); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_ts1, EPS); + } +#endif + + return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values); } static void run_test_spiluk() { @@ -268,6 +372,47 @@ struct SpilukTest { SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); } + static void run_test_spiluk_blocks() { + std::vector> A = get_9x9_fixture(); + + RowMapType row_map, brow_map; + EntriesType entries, bentries; + ValuesType values, bvalues; + + compress_matrix(row_map, entries, values, A); + + const size_type nrows = A.size(); + const size_type nnz = values.extent(0); + const lno_t fill_lev = 2; + const size_type block_size = nrows % 2 == 0 ? 2 : 3; + ASSERT_EQ(nrows % block_size, 0); + + KernelHandle kh; + +#ifdef TEST_SPILUK_FULL_CHECKS + // Check block_size=1 produces identical result to unblocked + run_and_check_spiluk_block(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); +#endif + + // Convert to BSR + Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, + entries); + Bsr bsr(crs, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + } + static void run_test_spiluk_scale() { // Create a diagonally dominant sparse matrix to test: constexpr auto nrows = 5000; @@ -566,45 +711,6 @@ struct SpilukTest { kh_v[i].destroy_spiluk_handle(); } } - - static void run_test_spiluk_blocks() { - std::vector> A = get_9x9_fixture(); - - RowMapType row_map, brow_map; - EntriesType entries, bentries; - ValuesType values, bvalues; - - compress_matrix(row_map, entries, values, A); - - const size_type nrows = A.size(); - const size_type nnz = values.extent(0); - const lno_t fill_lev = 2; - const size_type block_size = nrows % 2 == 0 ? 2 : 3; - ASSERT_EQ(nrows % block_size, 0); - - KernelHandle kh; - - // Check block_size=1 produces identical result to unblocked - // run_and_check_spiluk_block(kh, row_map, entries, values, - // SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); - - // Convert to BSR - Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, - entries); - Bsr bsr(crs, block_size); - - // Pull out views from BSR - Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); - Kokkos::resize(bentries, bsr.graph.entries.extent(0)); - Kokkos::resize(bvalues, bsr.values.extent(0)); - Kokkos::deep_copy(brow_map, bsr.graph.row_map); - Kokkos::deep_copy(bentries, bsr.graph.entries); - Kokkos::deep_copy(bvalues, bsr.values); - - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); - } }; } // namespace Test From 386fbeb4d760567d7c675c002958e103b89f3cac Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 30 Jan 2024 13:05:21 -0700 Subject: [PATCH 07/17] progress on spiluk --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 164 +++++++++++------- sparse/unit_test/Test_vector_fixtures.hpp | 2 +- 2 files changed, 106 insertions(+), 60 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index bbd2343cd3..a5f1a94e27 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -34,6 +34,7 @@ #include "KokkosBlas1_set.hpp" //#define NUMERIC_OUTPUT_INFO +//#define SPILUK_VERBOSE namespace KokkosSparse { namespace Impl { @@ -145,9 +146,6 @@ struct IlukWrap { } // lset_id - KOKKOS_INLINE_FUNCTION - void lset_id(const size_type nnz) const { L_values(nnz) = scalar_t(1.0); } - KOKKOS_INLINE_FUNCTION void lset_id(const member_type &team, const size_type nnz) const { // Not sure a Kokkos::single is really needed here since the @@ -191,6 +189,12 @@ struct IlukWrap { bool uequal(const size_type nnz, const scalar_t &value) const { return U_values(nnz) == value; } + + // print + KOKKOS_INLINE_FUNCTION + void print(const scalar_t &item) const { + std::cout << item << std::endl; + } }; // Partial specialization for block support @@ -216,24 +220,24 @@ struct IlukWrap { size_type block_items; sview_1d ones; + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; + using LValuesUnmanaged2DBlockType = Kokkos::View< typename LValuesType::value_type **, - typename KokkosKernels::Impl::GetUnifiedLayout< - LValuesType>::array_layout, + Layout, typename LValuesType::device_type, Kokkos::MemoryTraits >; using UValuesUnmanaged2DBlockType = Kokkos::View< typename UValuesType::value_type **, - typename KokkosKernels::Impl::GetUnifiedLayout< - UValuesType>::array_layout, + Layout, typename UValuesType::device_type, Kokkos::MemoryTraits >; using AValuesUnmanaged2DBlockType = Kokkos::View< typename AValuesType::value_type **, - typename KokkosKernels::Impl::GetUnifiedLayout< - AValuesType>::array_layout, + Layout, typename AValuesType::device_type, Kokkos::MemoryTraits >; @@ -301,11 +305,6 @@ struct IlukWrap { } // lset_id - KOKKOS_INLINE_FUNCTION - void lset_id(const size_type block) const { - KokkosBatched::SerialSetIdentity::invoke(lget(block)); - } - KOKKOS_INLINE_FUNCTION void lset_id(const member_type &team, const size_type block) const { KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); @@ -330,6 +329,7 @@ struct IlukWrap { } // gemm, alpha is hardcoded to -1, beta hardcoded to 1 + // C += -1 * A * B; template KOKKOS_INLINE_FUNCTION void gemm(const UValuesUnmanaged2DBlockType &A, const LValuesUnmanaged2DBlockType &B, @@ -376,6 +376,23 @@ struct IlukWrap { } return true; } + + // print + KOKKOS_INLINE_FUNCTION + void print(const LValuesUnmanaged2DBlockType &item) const { + for (size_type i = 0; i < block_size; ++i) { + std::cout << " "; + for (size_type j = 0; j < block_size; ++j) { + std::cout << item(i, j) << " "; + } + std::cout << std::endl; + } + // std::cout << " Raw: "; + // for (size_type i = 0; i < block_size * block_size; ++i) { + // std::cout << item.data()[i] << " "; + // } + // std::cout << std::endl; + } }; template Date: Wed, 31 Jan 2024 09:55:19 -0700 Subject: [PATCH 08/17] Progress. Block spiluk now checks out against analytical results --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 28 ++---- sparse/unit_test/Test_Sparse_spiluk.hpp | 96 ++++++++++++------- 2 files changed, 69 insertions(+), 55 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index a5f1a94e27..f45f129e7c 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -59,7 +59,6 @@ struct IlukWrap { using team_policy = typename IlukHandle::TeamPolicy; using member_type = typename team_policy::member_type; using range_policy = typename IlukHandle::RangePolicy; - using sview_1d = typename Kokkos::View; static team_policy get_team_policy(const size_type nrows, const int team_size) { @@ -162,10 +161,6 @@ struct IlukWrap { team.team_barrier(); } - // add. lhs += rhs - KOKKOS_INLINE_FUNCTION - void add(scalar_t &lhs, const scalar_t &rhs) const { lhs += rhs; } - // gemm, alpha is hardcoded to -1, beta hardcoded to 1 KOKKOS_INLINE_FUNCTION void gemm(const scalar_t &A, const scalar_t &B, scalar_t &C) const { @@ -218,7 +213,6 @@ struct IlukWrap { lno_t lev_start; size_type block_size; size_type block_items; - sview_1d ones; // BSR data is in LayoutRight! using Layout = Kokkos::LayoutRight; @@ -263,9 +257,7 @@ struct IlukWrap { iw(iw_), lev_start(lev_start_), block_size(block_size_), - block_items(block_size * block_size), - ones("ones", block_size) { - Kokkos::deep_copy(ones, 1.0); + block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); } @@ -322,12 +314,6 @@ struct IlukWrap { invoke(team, 1.0, rhs, lhs); } - // add. lhs += rhs - template - KOKKOS_INLINE_FUNCTION void add(Lview lhs, const Rview &rhs) const { - KokkosBatched::SerialAxpy::invoke(ones, rhs, lhs); - } - // gemm, alpha is hardcoded to -1, beta hardcoded to 1 // C += -1 * A * B; template @@ -467,13 +453,13 @@ struct IlukWrap { if (col < rowid) { Base::lset(ipos, Base::aget(k)); #ifdef SPILUK_VERBOSE - std::cout << " JGF Setting L[" << rowid << "][" << col << "] = " << std::endl; + std::cout << " JGF Setting L[" << rowid << "][" << col << "] = A[" << rowid << "][" << col << "] => " << std::endl; Base::print(Base::lget(ipos)); #endif } else { Base::uset(ipos, Base::aget(k)); #ifdef SPILUK_VERBOSE - std::cout << " JGF Setting U[" << rowid << "][" << col << "] = " << std::endl; + std::cout << " JGF Setting U[" << rowid << "][" << col << "] = A[" << rowid << "][" << col << "] => " << std::endl; Base::print(Base::uget(ipos)); #endif } @@ -490,10 +476,10 @@ struct IlukWrap { for (auto k = k1; k < k2; k++) { const auto prev_row = Base::L_entries(k); const auto udiag = Base::uget(Base::U_row_map(prev_row)); + Base::divide(team, Base::lget(k), udiag); auto fact = Base::lget(k); - Base::divide(team, fact, udiag); #ifdef SPILUK_VERBOSE - std::cout << " JGF Setting divide L[" << rowid << "][" << prev_row << "] /= U[" << prev_row << "][" << prev_row << "]" << std::endl; + std::cout << " JGF Setting divide L[" << rowid << "][" << prev_row << "] /= U[" << prev_row << "][" << prev_row << "] =" << std::endl; Base::print(fact); #endif Kokkos::parallel_for( @@ -507,7 +493,9 @@ struct IlukWrap { col < rowid ? Base::lget(ipos) : Base::uget(ipos); Base::gemm(fact, Base::uget(kk), C); #ifdef SPILUK_VERBOSE - std::cout << " JGF Setting Gemm " << (col < rowid ? "L" : "U") << "[" << rowid << "][" << col << "] =" << std::endl; + const auto icol = + col < rowid ? Base::L_entries(ipos) : Base::U_entries(ipos); + std::cout << " JGF Setting Gemm " << (col < rowid ? "L" : "U") << "[" << rowid << "][" << icol << "] -= fact * U[" << prev_row << "][" << col << "] =" << std::endl; Base::print(C); #endif } diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 41e4987176..053111669c 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -44,10 +44,25 @@ using kokkos_complex_float = Kokkos::complex; // Comment this out to do focussed debugging #define TEST_SPILUK_FULL_CHECKS +// Test verbosity level. 0 = none, 1 = print residuals, 2 = print L,U +#define TEST_SPILUK_VERBOSE_LEVEL 0 + +// #define TEST_SPILUK_TINY_TEST + namespace Test { +#ifdef TEST_SPILUK_TINY_TEST template -std::vector> get_9x9_fixture() { +std::vector> get_fixture() { + std::vector> A = {{10.00, 1.00, 0.00, 0.00}, + {0.00, 11.00, 0.00, 0.00}, + {0.00, 2.00, 12.00, 0.00}, + {5.00, 0.00, 3.00, 13.00}}; + return A; +} +#else +template +std::vector> get_fixture() { std::vector> A = { {10.00, 0.00, 0.30, 0.00, 0.00, 0.60, 0.00, 0.00, 0.00}, {0.00, 11.00, 0.00, 0.00, 0.00, 0.00, 0.70, 0.00, 0.00}, @@ -60,15 +75,7 @@ std::vector> get_9x9_fixture() { {0.00, 0.00, 0.00, 2.00, 2.50, 0.00, 0.00, 0.00, 18.00}}; return A; } - -template -std::vector> get_4x4_fixture() { - std::vector> A = {{10.00, 1.00, 0.00, 0.00}, - {0.00, 11.00, 0.00, 0.00}, - {0.00, 2.00, 12.00, 0.00}, - {5.00, 0.00, 0.00, 13.00}}; - return A; -} +#endif static constexpr double EPS = 1e-7; @@ -125,7 +132,7 @@ struct SpilukTest { static bool is_triangular(const RowMapType& drow_map, const EntriesType& dentries, const ValuesType& dvalues, bool check_lower, const size_type block_size = 1) { - const auto nrows = row_map.extent(0) - 1; + const auto nrows = drow_map.extent(0) - 1; const auto block_items = block_size * block_size; auto row_map = Kokkos::create_mirror_view(drow_map); @@ -147,17 +154,18 @@ struct SpilukTest { return false; } else if (col == row && block_size > 1) { + // Do the diagonal dense blocks also have to be upper/lower? // Check diagonal block - scalar_t* block = values.data() + nnz * block_items; - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - if ( (j > i && check_lower && block[i*block_size + j] != 0.0 ) || - (j < i && !check_lower && block[i*block_size + j] != 0.0) ) { - std::cout << "Bad block entry is: " << block[i*block_size + j] << std::endl; - return false; - } - } - } + // scalar_t* block = values.data() + nnz * block_items; + // for (size_type i = 0; i < block_size; ++i) { + // for (size_type j = 0; j < block_size; ++j) { + // if ( (j > i && check_lower && block[i*block_size + j] != 0.0 ) || + // (j < i && !check_lower && block[i*block_size + j] != 0.0) ) { + // std::cout << "Bad block entry is: " << block[i*block_size + j] << std::endl; + // return false; + // } + // } + // } } } } @@ -184,11 +192,15 @@ struct SpilukTest { EXPECT_TRUE(is_triangular(U_row_map, U_entries, U_values, false)); const auto result = check_result_impl(A, L, U, nrows); - // std::cout << "For nrows=" << nrows << ", unblocked had residual: " << result << std::endl; - // std::cout << "L" << std::endl; - // print_matrix(decompress_matrix(L_row_map, L_entries, L_values)); - // std::cout << "U" << std::endl; - // print_matrix(decompress_matrix(U_row_map, U_entries, U_values)); + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "For nrows=" << nrows << ", unblocked had residual: " << result << std::endl; + } + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "L result" << std::endl; + print_matrix(decompress_matrix(L_row_map, L_entries, L_values)); + std::cout << "U result" << std::endl; + print_matrix(decompress_matrix(U_row_map, U_entries, U_values)); + } EXPECT_LT(result, 1e-4); } @@ -212,11 +224,15 @@ struct SpilukTest { EXPECT_TRUE(is_triangular(U_row_map, U_entries, U_values, false, block_size)); const auto result = check_result_impl(A, L, U, nrows, block_size); - // std::cout << "For nrows=" << nrows << ", block_size=" << block_size << " had residual: " << result << std::endl; - // std::cout << "L" << std::endl; - // print_matrix(decompress_matrix(L_row_map, L_entries, L_values, block_size)); - // std::cout << "U" << std::endl; - // print_matrix(decompress_matrix(U_row_map, U_entries, U_values, block_size)); + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "For nrows=" << nrows << ", block_size=" << block_size << " had residual: " << result << std::endl; + } + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "L result" << std::endl; + print_matrix(decompress_matrix(L_row_map, L_entries, L_values, block_size)); + std::cout << "U result" << std::endl; + print_matrix(decompress_matrix(U_row_map, U_entries, U_values, block_size)); + } EXPECT_LT(result, 1e-2); } @@ -356,7 +372,12 @@ struct SpilukTest { } static void run_test_spiluk() { - std::vector> A = get_9x9_fixture(); + std::vector> A = get_fixture(); + + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "A input" << std::endl; + print_matrix(A); + } RowMapType row_map; EntriesType entries; @@ -373,7 +394,12 @@ struct SpilukTest { } static void run_test_spiluk_blocks() { - std::vector> A = get_9x9_fixture(); + std::vector> A = get_fixture(); + + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "A input" << std::endl; + print_matrix(A); + } RowMapType row_map, brow_map; EntriesType entries, bentries; @@ -507,7 +533,7 @@ struct SpilukTest { std::vector U_entries_v(nstreams); std::vector U_values_v(nstreams); - std::vector> Afix = get_9x9_fixture(); + std::vector> Afix = get_fixture(); RowMapType row_map; EntriesType entries; @@ -615,7 +641,7 @@ struct SpilukTest { std::vector U_entries_v(nstreams); std::vector U_values_v(nstreams); - std::vector> Afix = get_9x9_fixture(); + std::vector> Afix = get_fixture(); RowMapType row_map, brow_map; EntriesType entries, bentries; From a0eb7276dfe611c6bfaa11b5c1020bf23f71f0f7 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 2 Feb 2024 15:34:08 -0700 Subject: [PATCH 09/17] Progress --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 16 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 142 +++++++++++++----- 2 files changed, 109 insertions(+), 49 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index f45f129e7c..e5c20f283b 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -34,7 +34,7 @@ #include "KokkosBlas1_set.hpp" //#define NUMERIC_OUTPUT_INFO -//#define SPILUK_VERBOSE +#define SPILUK_VERBOSE namespace KokkosSparse { namespace Impl { @@ -373,11 +373,6 @@ struct IlukWrap { } std::cout << std::endl; } - // std::cout << " Raw: "; - // for (size_type i = 0; i < block_size * block_size; ++i) { - // std::cout << item.data()[i] << " "; - // } - // std::cout << std::endl; } }; @@ -479,7 +474,7 @@ struct IlukWrap { Base::divide(team, Base::lget(k), udiag); auto fact = Base::lget(k); #ifdef SPILUK_VERBOSE - std::cout << " JGF Setting divide L[" << rowid << "][" << prev_row << "] /= U[" << prev_row << "][" << prev_row << "] =" << std::endl; + std::cout << " JGF doing divide, fact = L[" << rowid << "][" << prev_row << "] /= U[" << prev_row << "][" << prev_row << "] =" << std::endl; Base::print(fact); #endif Kokkos::parallel_for( @@ -489,13 +484,14 @@ struct IlukWrap { const auto col = Base::U_entries(kk); const auto ipos = Base::iw(my_team, col); if (ipos != -1) { + const bool do_l = (col < rowid); //|| (col == 2 && rowid == 2 && prev_row == 1); typename Base::reftype C = - col < rowid ? Base::lget(ipos) : Base::uget(ipos); + do_l ? Base::lget(ipos) : Base::uget(ipos); Base::gemm(fact, Base::uget(kk), C); #ifdef SPILUK_VERBOSE const auto icol = - col < rowid ? Base::L_entries(ipos) : Base::U_entries(ipos); - std::cout << " JGF Setting Gemm " << (col < rowid ? "L" : "U") << "[" << rowid << "][" << icol << "] -= fact * U[" << prev_row << "][" << col << "] =" << std::endl; + do_l ? Base::L_entries(ipos) : Base::U_entries(ipos); + std::cout << " JGF doing gemm, " << (do_l ? "L" : "U") << "[" << rowid << "][" << icol << "] -= fact * U[" << prev_row << "][" << col << "] =" << std::endl; Base::print(C); #endif } diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 053111669c..32e7519dab 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -32,6 +32,7 @@ #include "Test_vector_fixtures.hpp" #include +#include using namespace KokkosSparse; using namespace KokkosSparse::Experimental; @@ -42,10 +43,10 @@ using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; // Comment this out to do focussed debugging -#define TEST_SPILUK_FULL_CHECKS +// #define TEST_SPILUK_FULL_CHECKS // Test verbosity level. 0 = none, 1 = print residuals, 2 = print L,U -#define TEST_SPILUK_VERBOSE_LEVEL 0 +#define TEST_SPILUK_VERBOSE_LEVEL 2 // #define TEST_SPILUK_TINY_TEST @@ -172,6 +173,40 @@ struct SpilukTest { return true; } + static void populate_bsr(const ValuesType& dvalues, const size_type block_size) + { + using RPDF = std::uniform_real_distribution; + + const size_type block_items = block_size * block_size; + const size_type num_blocks = dvalues.extent(0) / block_items; + const scalar_t ZERO = scalar_t(0); + std::mt19937_64 engine; + + auto values = Kokkos::create_mirror_view(dvalues); + Kokkos::deep_copy(values, dvalues); + + for (size_type block = 0; block < num_blocks; ++block) { + scalar_t min = std::numeric_limits::max(); + scalar_t max = std::numeric_limits::min(); + // Get the range of values in this block + for (size_type block_item = 0; block_item < block_items; ++block_item) { + const scalar_t val = values(block * block_items + block_item); + if (val < min) min = val; + if (val > max) max = val; + } + // Set the zeros to a random value in this range + RPDF val_gen(min, max); + for (size_type block_item = 0; block_item < block_items; ++block_item) { + scalar_t& val = values(block * block_items + block_item); + if (val == ZERO) { + val = val_gen(engine); + } + } + } + + Kokkos::deep_copy(dvalues, values); + } + static void check_result(const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const RowMapType& L_row_map, @@ -271,6 +306,9 @@ struct SpilukTest { Kokkos::fence(); + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "For fill_level=" << fill_lev << ", "; + } check_result(row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); @@ -332,6 +370,9 @@ struct SpilukTest { Kokkos::fence(); + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "For fill_level=" << fill_lev << ", "; + } check_result_block(row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, block_size); @@ -457,12 +498,13 @@ struct SpilukTest { Kokkos::deep_copy(entries, A.graph.entries); Kokkos::deep_copy(values, A.values); - const lno_t fill_lev = 2; + for (lno_t fill_lev = 2; fill_lev < 3; ++fill_lev) { - KernelHandle kh; + KernelHandle kh; - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + } } static void run_test_spiluk_scale_blocks() { @@ -474,31 +516,53 @@ struct SpilukTest { EntriesType bentries; ValuesType bvalues; - const size_type block_size = 10; + //const size_type block_size = 10; size_type nnz = 10 * nrows; auto A = - KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( - nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); - // Pull out views from CRS - Bsr bsr(A, block_size); + std::vector block_sizes = {/*1, 2, 4,*/ 10}; - // Pull out views from BSR - Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); - Kokkos::resize(bentries, bsr.graph.entries.extent(0)); - Kokkos::resize(bvalues, bsr.values.extent(0)); - Kokkos::deep_copy(brow_map, bsr.graph.row_map); - Kokkos::deep_copy(bentries, bsr.graph.entries); - Kokkos::deep_copy(bvalues, bsr.values); + for (auto block_size : block_sizes) { - const lno_t fill_lev = 2; + // Pull out views from CRS + Bsr bsr(A, block_size); - KernelHandle kh; + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); + // Fully fill / populate the dense blocks of the BSR? + populate_bsr(bvalues, block_size); + Kokkos::deep_copy(bsr.values, bvalues); + + for (lno_t fill_lev = 2; fill_lev < 3; ++fill_lev) { + + KernelHandle kh; + + // auto crs = KokkosSparse::Impl::bsr_to_crs(bsr); + + // RowMapType row_map("row_map", crs.graph.row_map.extent(0)); + // EntriesType entries("entries", crs.graph.entries.extent(0)); + // ValuesType values("values", crs.values.extent(0)); + // Kokkos::deep_copy(row_map, crs.graph.row_map); + // Kokkos::deep_copy(entries, crs.graph.entries); + // Kokkos::deep_copy(values, crs.values); + + // run_and_check_spiluk(kh, row_map, entries, values, + // SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + } + } } static void run_test_spiluk_streams(SPILUKAlgorithm test_algo, int nstreams) { @@ -745,10 +809,10 @@ template void test_spiluk() { using TestStruct = Test::SpilukTest; - TestStruct::run_test_spiluk(); + // TestStruct::run_test_spiluk(); TestStruct::run_test_spiluk_blocks(); - TestStruct::run_test_spiluk_scale(); - TestStruct::run_test_spiluk_scale_blocks(); + // TestStruct::run_test_spiluk_scale(); + // TestStruct::run_test_spiluk_scale_blocks(); } template ; - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); - - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - 1); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - 2); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - 3); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - 4); + // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); + // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); + // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); + // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); + + // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + // 1); + // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + // 2); + // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + // 3); + // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + // 4); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From ab2ecd3f8c4ad1fafc60ea5b3f8063357e00d8e3 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 12 Feb 2024 16:07:48 -0700 Subject: [PATCH 10/17] LUPrec test with spiluk woring --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 31 +- sparse/src/KokkosSparse_LUPrec.hpp | 86 ++++-- sparse/unit_test/Test_Sparse_spiluk.hpp | 279 +++++++++++------- 3 files changed, 236 insertions(+), 160 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index e5c20f283b..ff7800e938 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -34,7 +34,6 @@ #include "KokkosBlas1_set.hpp" //#define NUMERIC_OUTPUT_INFO -#define SPILUK_VERBOSE namespace KokkosSparse { namespace Impl { @@ -405,9 +404,6 @@ struct IlukWrap { const auto my_team = team.league_rank(); const auto rowid = Base::level_idx(my_team + Base::lev_start); // map to rowid -#ifdef SPILUK_VERBOSE - std::cout << "JGF Processing rowid=" << rowid << std::endl; -#endif // Set active entries in L to zero, store active cols in iw // Set L diagonal for this row to identity @@ -436,9 +432,6 @@ struct IlukWrap { team.team_barrier(); // Unpack the rowid-th row of A, copy into L,U -#ifdef SPILUK_VERBOSE - std::cout << " JGF Unpacking A" << std::endl; -#endif k1 = Base::A_row_map(rowid); k2 = Base::A_row_map(rowid + 1); Kokkos::parallel_for( @@ -447,25 +440,14 @@ struct IlukWrap { const auto ipos = Base::iw(my_team, col); if (col < rowid) { Base::lset(ipos, Base::aget(k)); -#ifdef SPILUK_VERBOSE - std::cout << " JGF Setting L[" << rowid << "][" << col << "] = A[" << rowid << "][" << col << "] => " << std::endl; - Base::print(Base::lget(ipos)); -#endif } else { Base::uset(ipos, Base::aget(k)); -#ifdef SPILUK_VERBOSE - std::cout << " JGF Setting U[" << rowid << "][" << col << "] = A[" << rowid << "][" << col << "] => " << std::endl; - Base::print(Base::uget(ipos)); -#endif } }); team.team_barrier(); // Eliminate prev rows -#ifdef SPILUK_VERBOSE - std::cout << " JGF Eliminating previous rows" << std::endl; -#endif k1 = Base::L_row_map(rowid); k2 = Base::L_row_map(rowid + 1) - 1; for (auto k = k1; k < k2; k++) { @@ -473,10 +455,6 @@ struct IlukWrap { const auto udiag = Base::uget(Base::U_row_map(prev_row)); Base::divide(team, Base::lget(k), udiag); auto fact = Base::lget(k); -#ifdef SPILUK_VERBOSE - std::cout << " JGF doing divide, fact = L[" << rowid << "][" << prev_row << "] /= U[" << prev_row << "][" << prev_row << "] =" << std::endl; - Base::print(fact); -#endif Kokkos::parallel_for( Kokkos::TeamThreadRange(team, Base::U_row_map(prev_row) + 1, Base::U_row_map(prev_row + 1)), @@ -484,16 +462,9 @@ struct IlukWrap { const auto col = Base::U_entries(kk); const auto ipos = Base::iw(my_team, col); if (ipos != -1) { - const bool do_l = (col < rowid); //|| (col == 2 && rowid == 2 && prev_row == 1); typename Base::reftype C = - do_l ? Base::lget(ipos) : Base::uget(ipos); + col < rowid ? Base::lget(ipos) : Base::uget(ipos); Base::gemm(fact, Base::uget(kk), C); -#ifdef SPILUK_VERBOSE - const auto icol = - do_l ? Base::L_entries(ipos) : Base::U_entries(ipos); - std::cout << " JGF doing gemm, " << (do_l ? "L" : "U") << "[" << rowid << "][" << icol << "] -= fact * U[" << prev_row << "][" << col << "] =" << std::endl; - Base::print(C); -#endif } }); // end for kk diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index a257b8f09c..bae3b8b9ea 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace KokkosSparse { namespace Experimental { @@ -45,8 +46,9 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using MEMSP = typename CRS::memory_space; + using DEVC = typename Kokkos::Device; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views @@ -61,11 +63,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { LUPrec(const CRSArg &L, const CRSArg &U) : _L(L), _U(U), - _tmp("LUPrec::_tmp", L.numRows()), - _tmp2("LUPrec::_tmp", L.numRows()), + _tmp("LUPrec::_tmp", L.numPointRows()), + _tmp2("LUPrec::_tmp", L.numPointRows()), _khL(), _khU() { - KK_REQUIRE_MSG(L.numRows() == U.numRows(), + KK_REQUIRE_MSG(L.numPointRows() == U.numPointRows(), "LUPrec: L.numRows() != U.numRows()"); _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), @@ -80,22 +82,12 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { _khU.destroy_sptrsv_handle(); } - ///// \brief Apply the preconditioner to X, putting the result in Y. - ///// - ///// \tparam XViewType Input vector, as a 1-D Kokkos::View - ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View - ///// - ///// \param transM [in] Not used. - ///// \param alpha [in] Not used - ///// \param beta [in] Not used. - ///// - ///// It takes L and U and the stores U^inv L^inv X in Y - // - virtual void apply( - const Kokkos::View> &X, - const Kokkos::View> &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + template ::value>::type* = nullptr> + void apply_impl( + const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp @@ -111,6 +103,60 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { KokkosBlas::axpby(alpha, _tmp2, beta, Y); } + + template ::value>::type* = nullptr> + void apply_impl( + const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + // tmp = trsv(L, x); //Apply L^inv to x + // y = trsv(U, tmp); //Apply U^inv to tmp + + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + using Layout = Kokkos::LayoutLeft; +#else + using Layout = Kokkos::LayoutRight; +#endif + + // trsv is implemented for MV so we need to convert our views + using UView2d = typename Kokkos::View >; + using UView2dc = typename Kokkos::View >; + UView2dc X2d(X.data(), X.extent(0), 1); + UView2d + Y2d(Y.data(), Y.extent(0), 1), + tmp2d(_tmp.data(), _tmp.extent(0), 1), + tmp22d(_tmp2.data(), _tmp2.extent(0), 1); + + KokkosSparse::trsv("L", "N", "N", _L, X2d, tmp2d); + KokkosSparse::trsv("U", "N", "N", _U, tmp2d, tmp22d); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); + } + + ///// \brief Apply the preconditioner to X, putting the result in Y. + ///// + ///// \tparam XViewType Input vector, as a 1-D Kokkos::View + ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View + ///// + ///// \param transM [in] Not used. + ///// \param alpha [in] Not used + ///// \param beta [in] Not used. + ///// + ///// It takes L and U and the stores U^inv L^inv X in Y + // + virtual void apply( + const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + apply_impl(X, Y, transM, alpha, beta); + } //@} //! Set this preconditioner's parameters. diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 32e7519dab..a1f099c102 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -28,6 +28,7 @@ #include "KokkosSparse_spiluk.hpp" #include "KokkosSparse_crs_to_bsr_impl.hpp" #include "KokkosSparse_bsr_to_crs_impl.hpp" +#include "KokkosSparse_LUPrec.hpp" #include "Test_vector_fixtures.hpp" @@ -43,10 +44,10 @@ using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; // Comment this out to do focussed debugging -// #define TEST_SPILUK_FULL_CHECKS +#define TEST_SPILUK_FULL_CHECKS // Test verbosity level. 0 = none, 1 = print residuals, 2 = print L,U -#define TEST_SPILUK_VERBOSE_LEVEL 2 +#define TEST_SPILUK_VERBOSE_LEVEL 0 // #define TEST_SPILUK_TINY_TEST @@ -131,17 +132,14 @@ struct SpilukTest { return diff_nrm / bb_nrm; } - static bool is_triangular(const RowMapType& drow_map, const EntriesType& dentries, const ValuesType& dvalues, bool check_lower, const size_type block_size = 1) + static bool is_triangular(const RowMapType& drow_map, const EntriesType& dentries, bool check_lower) { const auto nrows = drow_map.extent(0) - 1; - const auto block_items = block_size * block_size; auto row_map = Kokkos::create_mirror_view(drow_map); auto entries = Kokkos::create_mirror_view(dentries); - auto values = Kokkos::create_mirror_view(dvalues); Kokkos::deep_copy(row_map, drow_map); Kokkos::deep_copy(entries, dentries); - Kokkos::deep_copy(values, dvalues); for (size_type row = 0; row < nrows; ++row) { const auto row_nnz_begin = row_map(row); @@ -154,59 +152,11 @@ struct SpilukTest { else if (col < row && !check_lower) { return false; } - else if (col == row && block_size > 1) { - // Do the diagonal dense blocks also have to be upper/lower? - // Check diagonal block - // scalar_t* block = values.data() + nnz * block_items; - // for (size_type i = 0; i < block_size; ++i) { - // for (size_type j = 0; j < block_size; ++j) { - // if ( (j > i && check_lower && block[i*block_size + j] != 0.0 ) || - // (j < i && !check_lower && block[i*block_size + j] != 0.0) ) { - // std::cout << "Bad block entry is: " << block[i*block_size + j] << std::endl; - // return false; - // } - // } - // } - } } } return true; } - static void populate_bsr(const ValuesType& dvalues, const size_type block_size) - { - using RPDF = std::uniform_real_distribution; - - const size_type block_items = block_size * block_size; - const size_type num_blocks = dvalues.extent(0) / block_items; - const scalar_t ZERO = scalar_t(0); - std::mt19937_64 engine; - - auto values = Kokkos::create_mirror_view(dvalues); - Kokkos::deep_copy(values, dvalues); - - for (size_type block = 0; block < num_blocks; ++block) { - scalar_t min = std::numeric_limits::max(); - scalar_t max = std::numeric_limits::min(); - // Get the range of values in this block - for (size_type block_item = 0; block_item < block_items; ++block_item) { - const scalar_t val = values(block * block_items + block_item); - if (val < min) min = val; - if (val > max) max = val; - } - // Set the zeros to a random value in this range - RPDF val_gen(min, max); - for (size_type block_item = 0; block_item < block_items; ++block_item) { - scalar_t& val = values(block * block_items + block_item); - if (val == ZERO) { - val = val_gen(engine); - } - } - } - - Kokkos::deep_copy(dvalues, values); - } - static void check_result(const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const RowMapType& L_row_map, @@ -214,7 +164,8 @@ struct SpilukTest { const ValuesType& L_values, const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values) { + const ValuesType& U_values, + const lno_t fill_lev) { // Checking const auto nrows = row_map.extent(0) - 1; Crs A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries); @@ -223,12 +174,12 @@ struct SpilukTest { Crs U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, U_entries); - EXPECT_TRUE(is_triangular(L_row_map, L_entries, L_values, true)); - EXPECT_TRUE(is_triangular(U_row_map, U_entries, U_values, false)); + EXPECT_TRUE(is_triangular(L_row_map, L_entries, true)); + EXPECT_TRUE(is_triangular(U_row_map, U_entries, false)); const auto result = check_result_impl(A, L, U, nrows); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "For nrows=" << nrows << ", unblocked had residual: " << result << std::endl; + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev << ", unblocked had residual: " << result << std::endl; } if (TEST_SPILUK_VERBOSE_LEVEL > 1) { std::cout << "L result" << std::endl; @@ -237,7 +188,9 @@ struct SpilukTest { print_matrix(decompress_matrix(U_row_map, U_entries, U_values)); } - EXPECT_LT(result, 1e-4); + if (fill_lev > 1) { + EXPECT_LT(result, 1e-4); + } } static void check_result_block( @@ -245,7 +198,7 @@ struct SpilukTest { const ValuesType& values, const RowMapType& L_row_map, const EntriesType& L_entries, const ValuesType& L_values, const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values, const size_type block_size) { + const ValuesType& U_values, const lno_t fill_lev, const size_type block_size) { // Checking const auto nrows = row_map.extent(0) - 1; Bsr A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries, @@ -255,12 +208,12 @@ struct SpilukTest { Bsr U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, U_entries, block_size); - EXPECT_TRUE(is_triangular(L_row_map, L_entries, L_values, true, block_size)); - EXPECT_TRUE(is_triangular(U_row_map, U_entries, U_values, false, block_size)); + EXPECT_TRUE(is_triangular(L_row_map, L_entries, true)); + EXPECT_TRUE(is_triangular(U_row_map, U_entries, false)); const auto result = check_result_impl(A, L, U, nrows, block_size); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "For nrows=" << nrows << ", block_size=" << block_size << " had residual: " << result << std::endl; + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev << ", block_size=" << block_size << " had residual: " << result << std::endl; } if (TEST_SPILUK_VERBOSE_LEVEL > 1) { std::cout << "L result" << std::endl; @@ -269,7 +222,9 @@ struct SpilukTest { print_matrix(decompress_matrix(U_row_map, U_entries, U_values, block_size)); } - EXPECT_LT(result, 1e-2); + if (fill_lev > 1) { + EXPECT_LT(result, 1e-2); + } } static std::tuple 0) { - std::cout << "For fill_level=" << fill_lev << ", "; - } check_result(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values); + U_row_map, U_entries, U_values, fill_lev); kh.destroy_spiluk_handle(); @@ -370,11 +322,8 @@ struct SpilukTest { Kokkos::fence(); - if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "For fill_level=" << fill_lev << ", "; - } check_result_block(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, block_size); + U_row_map, U_entries, U_values, fill_lev, block_size); kh.destroy_spiluk_handle(); @@ -490,6 +439,8 @@ struct SpilukTest { KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + KokkosSparse::sort_crs_matrix(A); + // Pull out views from CRS RowMapType row_map("row_map", A.graph.row_map.extent(0)); EntriesType entries("entries", A.graph.entries.extent(0)); @@ -498,7 +449,7 @@ struct SpilukTest { Kokkos::deep_copy(entries, A.graph.entries); Kokkos::deep_copy(values, A.values); - for (lno_t fill_lev = 2; fill_lev < 3; ++fill_lev) { + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { KernelHandle kh; @@ -523,11 +474,13 @@ struct SpilukTest { KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); - std::vector block_sizes = {/*1, 2, 4,*/ 10}; + KokkosSparse::sort_crs_matrix(A); + + std::vector block_sizes = {1, 2, 4, 10}; for (auto block_size : block_sizes) { - // Pull out views from CRS + // Convert to BSR Bsr bsr(A, block_size); // Pull out views from BSR @@ -538,26 +491,10 @@ struct SpilukTest { Kokkos::deep_copy(bentries, bsr.graph.entries); Kokkos::deep_copy(bvalues, bsr.values); - // Fully fill / populate the dense blocks of the BSR? - populate_bsr(bvalues, block_size); - Kokkos::deep_copy(bsr.values, bvalues); - - for (lno_t fill_lev = 2; fill_lev < 3; ++fill_lev) { + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { KernelHandle kh; - // auto crs = KokkosSparse::Impl::bsr_to_crs(bsr); - - // RowMapType row_map("row_map", crs.graph.row_map.extent(0)); - // EntriesType entries("entries", crs.graph.entries.extent(0)); - // ValuesType values("values", crs.values.extent(0)); - // Kokkos::deep_copy(row_map, crs.graph.row_map); - // Kokkos::deep_copy(entries, crs.graph.entries); - // Kokkos::deep_copy(values, crs.values); - - // run_and_check_spiluk(kh, row_map, entries, values, - // SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, block_size); @@ -666,7 +603,8 @@ struct SpilukTest { for (int i = 0; i < nstreams; i++) { check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i]); + U_row_map_v[i], U_entries_v[i], U_values_v[i], + fill_lev); kh_v[i].destroy_spiluk_handle(); } @@ -796,11 +734,131 @@ struct SpilukTest { check_result_block(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], - block_size); + fill_lev, block_size); kh_v[i].destroy_spiluk_handle(); } } + + static void run_test_spiluk_precond_blocks() + { + // Test using par_ilut as a preconditioner + // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? + + // Create a diagonally dominant sparse matrix to test: + // par_ilut settings max_iters, res_delta_stop, fill_in_limit, and + // async_update are all left as defaults + constexpr auto nrows = 5000; + constexpr auto m = 15; + constexpr auto diagDominance = 2; + constexpr auto tol = 1e-5; + constexpr auto fill_lev = 2; + constexpr bool verbose = false; + + RowMapType brow_map; + EntriesType bentries; + ValuesType bvalues; + + size_type nnz = 10 * nrows; + auto A_unblocked = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A_unblocked); + + std::vector block_sizes = {/*1, 2, 4,*/ 10}; + + for (auto block_size : block_sizes) { + + // Convert to BSR + Bsr A(A_unblocked, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, A.graph.row_map.extent(0)); + Kokkos::resize(bentries, A.graph.entries.extent(0)); + Kokkos::resize(bvalues, A.values.extent(0)); + Kokkos::deep_copy(brow_map, A.graph.row_map); + Kokkos::deep_copy(bentries, A.graph.entries); + Kokkos::deep_copy(bvalues, A.values); + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + gmres_handle->set_verbose(verbose); + using GMRESHandle = + typename std::remove_reference::type; + + const auto [L_row_map, L_entries, L_values, U_row_map, + U_entries, U_values] = + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + + + // Create BSRs + const auto bnrows = L_row_map.extent(0) - 1; + Bsr L("L_Mtx", bnrows, bnrows, L_values.extent(0), L_values, L_row_map, + L_entries, block_size); + Bsr U("U_Mtx", bnrows, bnrows, U_values.extent(0), U_values, U_row_map, + U_entries, block_size); + + // Set initial vectors: + ValuesType X("X", nrows); // Solution and initial guess + ValuesType Wj("Wj", nrows); // For checking residuals at end. + ValuesType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + nrows); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b + { + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_GT(num_iters_plain, 0); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } + + // Solve Ax = b with LU preconditioner. + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); + + // Make precond + KokkosSparse::Experimental::LUPrec myPrec(L, U); + + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X, &myPrec); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); + } + } + } + }; } // namespace Test @@ -809,10 +867,11 @@ template void test_spiluk() { using TestStruct = Test::SpilukTest; - // TestStruct::run_test_spiluk(); + TestStruct::run_test_spiluk(); TestStruct::run_test_spiluk_blocks(); - // TestStruct::run_test_spiluk_scale(); - // TestStruct::run_test_spiluk_scale_blocks(); + TestStruct::run_test_spiluk_scale(); + TestStruct::run_test_spiluk_scale_blocks(); + TestStruct::run_test_spiluk_precond_blocks(); } template ; - // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); - // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); - // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); - // TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); - - // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - // 1); - // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - // 2); - // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - // 3); - // TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, - // 4); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); + + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 1); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 2); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 3); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 4); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From bf4228f04dbf33be8271decf58b6293fa848144d Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 12 Feb 2024 16:42:11 -0700 Subject: [PATCH 11/17] Fix pedantic --- sparse/unit_test/Test_Sparse_spiluk.hpp | 8 ++++---- sparse/unit_test/Test_vector_fixtures.hpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index a1f099c102..f96d68fe58 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -134,7 +134,7 @@ struct SpilukTest { static bool is_triangular(const RowMapType& drow_map, const EntriesType& dentries, bool check_lower) { - const auto nrows = drow_map.extent(0) - 1; + const size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); auto entries = Kokkos::create_mirror_view(dentries); @@ -142,10 +142,10 @@ struct SpilukTest { Kokkos::deep_copy(entries, dentries); for (size_type row = 0; row < nrows; ++row) { - const auto row_nnz_begin = row_map(row); - const auto row_nnz_end = row_map(row+1); + const size_type row_nnz_begin = row_map(row); + const size_type row_nnz_end = row_map(row+1); for (size_type nnz = row_nnz_begin; nnz < row_nnz_end; ++nnz) { - const auto col = entries(nnz); + const size_type col = entries(nnz); if (col > row && check_lower) { return false; } diff --git a/sparse/unit_test/Test_vector_fixtures.hpp b/sparse/unit_test/Test_vector_fixtures.hpp index 17b734629b..0abc8034c9 100644 --- a/sparse/unit_test/Test_vector_fixtures.hpp +++ b/sparse/unit_test/Test_vector_fixtures.hpp @@ -117,7 +117,7 @@ decompress_matrix(const RowMapT& row_map, const EntriesT& entries, template std::vector> decompress_matrix(const RowMapT& row_map, const EntriesT& entries, - const ValuesT& values, const int block_size) { + const ValuesT& values, typename RowMapT::const_value_type block_size) { using size_type = typename RowMapT::non_const_value_type; using scalar_t = typename ValuesT::non_const_value_type; From 09df094d6c7105e1faac0b4082d8eb2fa27085fd Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 13 Feb 2024 09:45:24 -0700 Subject: [PATCH 12/17] Disable spiluk LU test on non-host --- sparse/unit_test/Test_Sparse_spiluk.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index f96d68fe58..2e8ac8b602 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -755,6 +755,13 @@ struct SpilukTest { constexpr auto fill_lev = 2; constexpr bool verbose = false; + // Skip test if not on host. trsv only works on host + static constexpr bool is_host = std::is_same< + execution_space, typename Kokkos::DefaultHostExecutionSpace>::value; + if (!is_host) { + return; + } + RowMapType brow_map; EntriesType bentries; ValuesType bvalues; @@ -835,7 +842,7 @@ struct SpilukTest { gmres_handle->reset_handle(m, tol); gmres_handle->set_verbose(verbose); - // Make precond + // Make precond. KokkosSparse::Experimental::LUPrec myPrec(L, U); // reset X for next gmres call From db5d85709fd48da12f2333011c923d6837d55637 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 13 Feb 2024 10:01:21 -0700 Subject: [PATCH 13/17] formatting --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 101 +++++++++--------- sparse/src/KokkosSparse_LUPrec.hpp | 55 +++++----- sparse/unit_test/Test_Sparse_spiluk.hpp | 86 +++++++-------- sparse/unit_test/Test_vector_fixtures.hpp | 3 +- 4 files changed, 123 insertions(+), 122 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index ff7800e938..29a78625f8 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -186,9 +186,7 @@ struct IlukWrap { // print KOKKOS_INLINE_FUNCTION - void print(const scalar_t &item) const { - std::cout << item << std::endl; - } + void print(const scalar_t &item) const { std::cout << item << std::endl; } }; // Partial specialization for block support @@ -217,20 +215,17 @@ struct IlukWrap { using Layout = Kokkos::LayoutRight; using LValuesUnmanaged2DBlockType = Kokkos::View< - typename LValuesType::value_type **, - Layout, + typename LValuesType::value_type **, Layout, typename LValuesType::device_type, Kokkos::MemoryTraits >; using UValuesUnmanaged2DBlockType = Kokkos::View< - typename UValuesType::value_type **, - Layout, + typename UValuesType::value_type **, Layout, typename UValuesType::device_type, Kokkos::MemoryTraits >; using AValuesUnmanaged2DBlockType = Kokkos::View< - typename AValuesType::value_type **, - Layout, + typename AValuesType::value_type **, Layout, typename AValuesType::device_type, Kokkos::MemoryTraits >; @@ -368,7 +363,7 @@ struct IlukWrap { for (size_type i = 0; i < block_size; ++i) { std::cout << " "; for (size_type j = 0; j < block_size; ++j) { - std::cout << item(i, j) << " "; + std::cout << item(i, j) << " "; } std::cout << std::endl; } @@ -410,40 +405,40 @@ struct IlukWrap { size_type k1 = Base::L_row_map(rowid); size_type k2 = Base::L_row_map(rowid + 1) - 1; Base::lset_id(team, k2); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::L_entries(k); - Base::lset(k, 0.0); - Base::iw(my_team, col) = k; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::L_entries(k); + Base::lset(k, 0.0); + Base::iw(my_team, col) = k; + }); team.team_barrier(); // Set active entries in U to zero, store active cols in iw k1 = Base::U_row_map(rowid); k2 = Base::U_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::U_entries(k); - Base::uset(k, 0.0); - Base::iw(my_team, col) = k; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::U_entries(k); + Base::uset(k, 0.0); + Base::iw(my_team, col) = k; + }); team.team_barrier(); // Unpack the rowid-th row of A, copy into L,U k1 = Base::A_row_map(rowid); k2 = Base::A_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::A_entries(k); - const auto ipos = Base::iw(my_team, col); - if (col < rowid) { - Base::lset(ipos, Base::aget(k)); - } else { - Base::uset(ipos, Base::aget(k)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::A_entries(k); + const auto ipos = Base::iw(my_team, col); + if (col < rowid) { + Base::lset(ipos, Base::aget(k)); + } else { + Base::uset(ipos, Base::aget(k)); + } + }); team.team_barrier(); @@ -456,17 +451,17 @@ struct IlukWrap { Base::divide(team, Base::lget(k), udiag); auto fact = Base::lget(k); Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, Base::U_row_map(prev_row) + 1, - Base::U_row_map(prev_row + 1)), - [&](const size_type kk) { - const auto col = Base::U_entries(kk); - const auto ipos = Base::iw(my_team, col); - if (ipos != -1) { - typename Base::reftype C = - col < rowid ? Base::lget(ipos) : Base::uget(ipos); - Base::gemm(fact, Base::uget(kk), C); - } - }); // end for kk + Kokkos::TeamThreadRange(team, Base::U_row_map(prev_row) + 1, + Base::U_row_map(prev_row + 1)), + [&](const size_type kk) { + const auto col = Base::U_entries(kk); + const auto ipos = Base::iw(my_team, col); + if (ipos != -1) { + typename Base::reftype C = + col < rowid ? Base::lget(ipos) : Base::uget(ipos); + Base::gemm(fact, Base::uget(kk), C); + } + }); // end for kk team.team_barrier(); } // end for k @@ -483,19 +478,19 @@ struct IlukWrap { // Reset k1 = Base::L_row_map(rowid); k2 = Base::L_row_map(rowid + 1) - 1; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::L_entries(k); - Base::iw(my_team, col) = -1; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::L_entries(k); + Base::iw(my_team, col) = -1; + }); k1 = Base::U_row_map(rowid); k2 = Base::U_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - const auto col = Base::U_entries(k); - Base::iw(my_team, col) = -1; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::U_entries(k); + Base::iw(my_team, col) = -1; + }); } }; diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index bae3b8b9ea..6d492634c5 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -48,7 +48,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using MEMSP = typename CRS::memory_space; using DEVC = typename Kokkos::Device; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views @@ -82,12 +82,13 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { _khU.destroy_sptrsv_handle(); } - template ::value>::type* = nullptr> - void apply_impl( - const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + template < + typename Matrix, + typename std::enable_if::value>::type * = nullptr> + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp @@ -104,12 +105,13 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { KokkosBlas::axpby(alpha, _tmp2, beta, Y); } - template ::value>::type* = nullptr> - void apply_impl( - const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + template < + typename Matrix, + typename std::enable_if::value>::type * = nullptr> + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp @@ -123,15 +125,16 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { #endif // trsv is implemented for MV so we need to convert our views - using UView2d = typename Kokkos::View >; - using UView2dc = typename Kokkos::View >; + using UView2d = typename Kokkos::View< + ScalarType **, Layout, DEVC, + Kokkos::MemoryTraits >; + using UView2dc = typename Kokkos::View< + const ScalarType **, Layout, DEVC, + Kokkos::MemoryTraits >; UView2dc X2d(X.data(), X.extent(0), 1); - UView2d - Y2d(Y.data(), Y.extent(0), 1), - tmp2d(_tmp.data(), _tmp.extent(0), 1), - tmp22d(_tmp2.data(), _tmp2.extent(0), 1); + UView2d Y2d(Y.data(), Y.extent(0), 1), + tmp2d(_tmp.data(), _tmp.extent(0), 1), + tmp22d(_tmp2.data(), _tmp2.extent(0), 1); KokkosSparse::trsv("L", "N", "N", _L, X2d, tmp2d); KokkosSparse::trsv("U", "N", "N", _U, tmp2d, tmp22d); @@ -150,11 +153,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// ///// It takes L and U and the stores U^inv L^inv X in Y // - virtual void apply( - const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + virtual void apply(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", + ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { apply_impl(X, Y, transM, alpha, beta); } //@} diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 2e8ac8b602..ff6beccbec 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -132,8 +132,8 @@ struct SpilukTest { return diff_nrm / bb_nrm; } - static bool is_triangular(const RowMapType& drow_map, const EntriesType& dentries, bool check_lower) - { + static bool is_triangular(const RowMapType& drow_map, + const EntriesType& dentries, bool check_lower) { const size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); @@ -143,13 +143,12 @@ struct SpilukTest { for (size_type row = 0; row < nrows; ++row) { const size_type row_nnz_begin = row_map(row); - const size_type row_nnz_end = row_map(row+1); + const size_type row_nnz_end = row_map(row + 1); for (size_type nnz = row_nnz_begin; nnz < row_nnz_end; ++nnz) { const size_type col = entries(nnz); if (col > row && check_lower) { return false; - } - else if (col < row && !check_lower) { + } else if (col < row && !check_lower) { return false; } } @@ -164,8 +163,7 @@ struct SpilukTest { const ValuesType& L_values, const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values, - const lno_t fill_lev) { + const ValuesType& U_values, const lno_t fill_lev) { // Checking const auto nrows = row_map.extent(0) - 1; Crs A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries); @@ -179,7 +177,8 @@ struct SpilukTest { const auto result = check_result_impl(A, L, U, nrows); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev << ", unblocked had residual: " << result << std::endl; + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev + << ", unblocked had residual: " << result << std::endl; } if (TEST_SPILUK_VERBOSE_LEVEL > 1) { std::cout << "L result" << std::endl; @@ -198,7 +197,8 @@ struct SpilukTest { const ValuesType& values, const RowMapType& L_row_map, const EntriesType& L_entries, const ValuesType& L_values, const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values, const lno_t fill_lev, const size_type block_size) { + const ValuesType& U_values, const lno_t fill_lev, + const size_type block_size) { // Checking const auto nrows = row_map.extent(0) - 1; Bsr A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries, @@ -213,13 +213,17 @@ struct SpilukTest { const auto result = check_result_impl(A, L, U, nrows, block_size); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev << ", block_size=" << block_size << " had residual: " << result << std::endl; + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev + << ", block_size=" << block_size << " had residual: " << result + << std::endl; } if (TEST_SPILUK_VERBOSE_LEVEL > 1) { std::cout << "L result" << std::endl; - print_matrix(decompress_matrix(L_row_map, L_entries, L_values, block_size)); + print_matrix( + decompress_matrix(L_row_map, L_entries, L_values, block_size)); std::cout << "U result" << std::endl; - print_matrix(decompress_matrix(U_row_map, U_entries, U_values, block_size)); + print_matrix( + decompress_matrix(U_row_map, U_entries, U_values, block_size)); } if (fill_lev > 1) { @@ -231,7 +235,8 @@ struct SpilukTest { EntriesType, ValuesType> run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, - SPILUKAlgorithm alg, const lno_t fill_lev, const int team_size = -1) { + SPILUKAlgorithm alg, const lno_t fill_lev, + const int team_size = -1) { const size_type nrows = row_map.extent(0) - 1; kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows); @@ -288,10 +293,11 @@ struct SpilukTest { static std::tuple - run_and_check_spiluk_block( - KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, - const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev, - const size_type block_size, const int team_size = -1) { + run_and_check_spiluk_block(KernelHandle& kh, const RowMapType& row_map, + const EntriesType& entries, + const ValuesType& values, SPILUKAlgorithm alg, + const lno_t fill_lev, const size_type block_size, + const int team_size = -1) { const size_type block_items = block_size * block_size; const size_type nrows = row_map.extent(0) - 1; kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, block_size); @@ -332,7 +338,8 @@ struct SpilukTest { if (block_size == 1) { const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, U_entries_u, U_values_u] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, team_size); + run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, + team_size); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); @@ -346,7 +353,8 @@ struct SpilukTest { if (team_size != 1) { const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, U_entries_ts1, U_values_ts1] = - run_and_check_spiluk_block(kh, row_map, entries, values, alg, fill_lev, block_size, 1); + run_and_check_spiluk_block(kh, row_map, entries, values, alg, + fill_lev, block_size, 1); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); @@ -450,7 +458,6 @@ struct SpilukTest { Kokkos::deep_copy(values, A.values); for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { - KernelHandle kh; run_and_check_spiluk(kh, row_map, entries, values, @@ -467,19 +474,18 @@ struct SpilukTest { EntriesType bentries; ValuesType bvalues; - //const size_type block_size = 10; + // const size_type block_size = 10; size_type nnz = 10 * nrows; auto A = - KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( - nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); KokkosSparse::sort_crs_matrix(A); std::vector block_sizes = {1, 2, 4, 10}; for (auto block_size : block_sizes) { - // Convert to BSR Bsr bsr(A, block_size); @@ -492,7 +498,6 @@ struct SpilukTest { Kokkos::deep_copy(bvalues, bsr.values); for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { - KernelHandle kh; run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, @@ -603,8 +608,7 @@ struct SpilukTest { for (int i = 0; i < nstreams; i++) { check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - fill_lev); + U_row_map_v[i], U_entries_v[i], U_values_v[i], fill_lev); kh_v[i].destroy_spiluk_handle(); } @@ -740,8 +744,7 @@ struct SpilukTest { } } - static void run_test_spiluk_precond_blocks() - { + static void run_test_spiluk_precond_blocks() { // Test using par_ilut as a preconditioner // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? @@ -756,8 +759,9 @@ struct SpilukTest { constexpr bool verbose = false; // Skip test if not on host. trsv only works on host - static constexpr bool is_host = std::is_same< - execution_space, typename Kokkos::DefaultHostExecutionSpace>::value; + static constexpr bool is_host = + std::is_same::value; if (!is_host) { return; } @@ -767,15 +771,15 @@ struct SpilukTest { ValuesType bvalues; size_type nnz = 10 * nrows; - auto A_unblocked = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( - nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + auto A_unblocked = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); KokkosSparse::sort_crs_matrix(A_unblocked); std::vector block_sizes = {/*1, 2, 4,*/ 10}; for (auto block_size : block_sizes) { - // Convert to BSR Bsr A(A_unblocked, block_size); @@ -793,14 +797,13 @@ struct SpilukTest { auto gmres_handle = kh.get_gmres_handle(); gmres_handle->set_verbose(verbose); using GMRESHandle = - typename std::remove_reference::type; - - const auto [L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values] = - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); + typename std::remove_reference::type; + const auto [L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values] = + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); // Create BSRs const auto bnrows = L_row_map.extent(0) - 1; @@ -865,7 +868,6 @@ struct SpilukTest { } } } - }; } // namespace Test diff --git a/sparse/unit_test/Test_vector_fixtures.hpp b/sparse/unit_test/Test_vector_fixtures.hpp index 0abc8034c9..b17197a3e5 100644 --- a/sparse/unit_test/Test_vector_fixtures.hpp +++ b/sparse/unit_test/Test_vector_fixtures.hpp @@ -117,7 +117,8 @@ decompress_matrix(const RowMapT& row_map, const EntriesT& entries, template std::vector> decompress_matrix(const RowMapT& row_map, const EntriesT& entries, - const ValuesT& values, typename RowMapT::const_value_type block_size) { + const ValuesT& values, + typename RowMapT::const_value_type block_size) { using size_type = typename RowMapT::non_const_value_type; using scalar_t = typename ValuesT::non_const_value_type; From 7c8667dc29ac1e022e849822c2ef07c0c1d61762 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 14 Feb 2024 12:54:55 -0700 Subject: [PATCH 14/17] Enhancements to spiluk test --- sparse/unit_test/Test_Sparse_spiluk.hpp | 417 +++++++++++------------- 1 file changed, 193 insertions(+), 224 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index ff6beccbec..a4ad1c7b33 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -29,6 +29,7 @@ #include "KokkosSparse_crs_to_bsr_impl.hpp" #include "KokkosSparse_bsr_to_crs_impl.hpp" #include "KokkosSparse_LUPrec.hpp" +#include "KokkosSparse_gmres.hpp" #include "Test_vector_fixtures.hpp" @@ -79,6 +80,42 @@ std::vector> get_fixture() { } #endif +template < + typename MatrixType, + typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t ) { + return A_unblocked; +} + +template < + typename MatrixType, + typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t block_size) { + // Convert to BSR + MatrixType A(A_unblocked, block_size); + + return A; +} + +template < + typename MatrixType, typename RowMapType, typename EntriesType, typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const size_t) { + const auto nrows = row_map.extent(0) - 1; + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, entries); +} + +template < + typename MatrixType, typename RowMapType, typename EntriesType, typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const size_t block_size) { + const auto nrows = row_map.extent(0) - 1; + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, entries, block_size); +} + + static constexpr double EPS = 1e-7; template 0) { - std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev - << ", unblocked had residual: " << result << std::endl; - } - if (TEST_SPILUK_VERBOSE_LEVEL > 1) { - std::cout << "L result" << std::endl; - print_matrix(decompress_matrix(L_row_map, L_entries, L_values)); - std::cout << "U result" << std::endl; - print_matrix(decompress_matrix(U_row_map, U_entries, U_values)); - } - - if (fill_lev > 1) { - EXPECT_LT(result, 1e-4); - } - } - - static void check_result_block( + template + static void check_result( const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const RowMapType& L_row_map, const EntriesType& L_entries, const ValuesType& L_values, const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values, const lno_t fill_lev, - const size_type block_size) { + const ValuesType& U_values, const lno_t fill_lev, const size_type block_size = 1) { + + using sp_matrix_type = std::conditional_t; + + KK_REQUIRE(UseBlocks || (block_size == 1)); + // Checking const auto nrows = row_map.extent(0) - 1; - Bsr A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries, - block_size); - Bsr L("L_Mtx", nrows, nrows, L_values.extent(0), L_values, L_row_map, - L_entries, block_size); - Bsr U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, - U_entries, block_size); + auto A = make_matrix("A_Mtx", row_map, entries, values, block_size); + auto L = make_matrix("L_Mtx", L_row_map, L_entries, L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, U_values, block_size); EXPECT_TRUE(is_triangular(L_row_map, L_entries, true)); EXPECT_TRUE(is_triangular(U_row_map, U_entries, false)); const auto result = check_result_impl(A, L, U, nrows, block_size); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev - << ", block_size=" << block_size << " had residual: " << result - << std::endl; + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev; + if (UseBlocks) { + std::cout << ", block_size=" << block_size; + } + else { + std::cout << ", unblocked"; + } + std::cout << " had residual: " << result << std::endl; } if (TEST_SPILUK_VERBOSE_LEVEL > 1) { std::cout << "L result" << std::endl; @@ -227,84 +235,32 @@ struct SpilukTest { } if (fill_lev > 1) { - EXPECT_LT(result, 1e-2); + if (UseBlocks) { + EXPECT_LT(result, 1e-2); + } + else { + EXPECT_LT(result, 1e-4); + } } } + template static std::tuple - run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, - const EntriesType& entries, const ValuesType& values, - SPILUKAlgorithm alg, const lno_t fill_lev, - const int team_size = -1) { - const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows); - - auto spiluk_handle = kh.get_spiluk_handle(); - if (team_size != -1) { - spiluk_handle->set_team_size(team_size); - } - - // Allocate L and U as outputs - RowMapType L_row_map("L_row_map", nrows + 1); - EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - RowMapType U_row_map("U_row_map", nrows + 1); - EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - - spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, - U_row_map, U_entries); - - Kokkos::fence(); - - Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values); - - Kokkos::fence(); + run_and_check_spiluk( + KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, + const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev, + const size_type block_size = 1) + { + KK_REQUIRE(UseBlocks || (block_size == 1)); - check_result(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, fill_lev); - - kh.destroy_spiluk_handle(); - -#ifdef TEST_SPILUK_FULL_CHECKS - // Check that team size = 1 produces same result - if (team_size != 1) { - const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, - U_entries_ts1, U_values_ts1] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, 1); - - EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); - EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); - EXPECT_NEAR_KK_1DVIEW(L_values, L_values_ts1, EPS); - EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_ts1, EPS); - EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_ts1, EPS); - EXPECT_NEAR_KK_1DVIEW(U_values, U_values_ts1, EPS); - } -#endif - - return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, - U_values); - } - - static std::tuple - run_and_check_spiluk_block(KernelHandle& kh, const RowMapType& row_map, - const EntriesType& entries, - const ValuesType& values, SPILUKAlgorithm alg, - const lno_t fill_lev, const size_type block_size, - const int team_size = -1) { const size_type block_items = block_size * block_size; const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, block_size); + kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, !UseBlocks ? 0 : block_size); auto spiluk_handle = kh.get_spiluk_handle(); - if (team_size != -1) { - spiluk_handle->set_team_size(team_size); + if (TeamSize != -1) { + spiluk_handle->set_team_size(TeamSize); } // Allocate L and U as outputs @@ -328,18 +284,17 @@ struct SpilukTest { Kokkos::fence(); - check_result_block(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, fill_lev, block_size); + check_result(row_map, entries, values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, fill_lev, block_size); kh.destroy_spiluk_handle(); #ifdef TEST_SPILUK_FULL_CHECKS // If block_size is 1, results should exactly match unblocked results - if (block_size == 1) { + if (block_size == 1 && UseBlocks) { const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, U_entries_u, U_values_u] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, - team_size); + run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); @@ -350,11 +305,10 @@ struct SpilukTest { } // Check that team size = 1 produces same result - if (team_size != 1) { + if (TeamSize != 1) { const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, U_entries_ts1, U_values_ts1] = - run_and_check_spiluk_block(kh, row_map, entries, values, alg, - fill_lev, block_size, 1); + run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, block_size); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); @@ -387,8 +341,8 @@ struct SpilukTest { KernelHandle kh; - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); } static void run_test_spiluk_blocks() { @@ -413,28 +367,27 @@ struct SpilukTest { KernelHandle kh; -#ifdef TEST_SPILUK_FULL_CHECKS - // Check block_size=1 produces identical result to unblocked - run_and_check_spiluk_block(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); -#endif - - // Convert to BSR Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, entries); - Bsr bsr(crs, block_size); - // Pull out views from BSR - Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); - Kokkos::resize(bentries, bsr.graph.entries.extent(0)); - Kokkos::resize(bvalues, bsr.values.extent(0)); - Kokkos::deep_copy(brow_map, bsr.graph.row_map); - Kokkos::deep_copy(bentries, bsr.graph.entries); - Kokkos::deep_copy(bvalues, bsr.values); + std::vector block_sizes = {1, block_size}; + + for (auto block_size_itr : block_sizes) { + + Bsr bsr(crs, block_size_itr); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size_itr); + } } static void run_test_spiluk_scale() { @@ -460,8 +413,8 @@ struct SpilukTest { for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { KernelHandle kh; - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); } } @@ -500,7 +453,7 @@ struct SpilukTest { for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { KernelHandle kh; - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + run_and_check_spiluk(kh, brow_map, bentries, bvalues, SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, block_size); } @@ -606,9 +559,10 @@ struct SpilukTest { // Checking for (int i = 0; i < nstreams; i++) { - check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], fill_lev); + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + fill_lev); kh_v[i].destroy_spiluk_handle(); } @@ -735,7 +689,7 @@ struct SpilukTest { // Checking for (int i = 0; i < nstreams; i++) { - check_result_block(A_row_map_v[i], A_entries_v[i], A_values_v[i], + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], fill_lev, block_size); @@ -744,26 +698,30 @@ struct SpilukTest { } } - static void run_test_spiluk_precond_blocks() { + template + static void run_test_spiluk_precond() + { // Test using par_ilut as a preconditioner // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? // Create a diagonally dominant sparse matrix to test: // par_ilut settings max_iters, res_delta_stop, fill_in_limit, and // async_update are all left as defaults + using sp_matrix_type = std::conditional_t; + constexpr auto nrows = 5000; constexpr auto m = 15; constexpr auto diagDominance = 2; constexpr auto tol = 1e-5; - constexpr auto fill_lev = 2; constexpr bool verbose = false; - // Skip test if not on host. trsv only works on host - static constexpr bool is_host = - std::is_same::value; - if (!is_host) { - return; + if (UseBlocks) { + // Skip test if not on host. block trsv only works on host + static constexpr bool is_host = std::is_same< + execution_space, typename Kokkos::DefaultHostExecutionSpace>::value; + if (!is_host) { + return; + } } RowMapType brow_map; @@ -777,11 +735,13 @@ struct SpilukTest { KokkosSparse::sort_crs_matrix(A_unblocked); - std::vector block_sizes = {/*1, 2, 4,*/ 10}; + std::vector block_sizes_blocked = {1, 2, 4, 10}; + std::vector block_sizes_unblocked = {1}; + std::vector block_sizes = UseBlocks ? block_sizes_blocked : block_sizes_unblocked; for (auto block_size : block_sizes) { - // Convert to BSR - Bsr A(A_unblocked, block_size); + // Convert to BSR if block enabled + auto A = get_A(A_unblocked, block_size); // Pull out views from BSR Kokkos::resize(brow_map, A.graph.row_map.extent(0)); @@ -797,74 +757,82 @@ struct SpilukTest { auto gmres_handle = kh.get_gmres_handle(); gmres_handle->set_verbose(verbose); using GMRESHandle = - typename std::remove_reference::type; - - const auto [L_row_map, L_entries, L_values, U_row_map, U_entries, - U_values] = - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); - - // Create BSRs - const auto bnrows = L_row_map.extent(0) - 1; - Bsr L("L_Mtx", bnrows, bnrows, L_values.extent(0), L_values, L_row_map, - L_entries, block_size); - Bsr U("U_Mtx", bnrows, bnrows, U_values.extent(0), U_values, U_row_map, - U_entries, block_size); - - // Set initial vectors: - ValuesType X("X", nrows); // Solution and initial guess - ValuesType Wj("Wj", nrows); // For checking residuals at end. - ValuesType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), - nrows); // right-hand side vec - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B, 1.0); - - int num_iters_plain(0), num_iters_precond(0); - - // Solve Ax = b - { - gmres(&kh, A, B, X); - - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - float_t endRes = KokkosBlas::nrm2(B) / nrmB; - - const auto conv_flag = gmres_handle->get_conv_flag_val(); - num_iters_plain = gmres_handle->get_num_iters(); - - EXPECT_GT(num_iters_plain, 0); - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - } + typename std::remove_reference::type; + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { - // Solve Ax = b with LU preconditioner. - { - gmres_handle->reset_handle(m, tol); - gmres_handle->set_verbose(verbose); + const auto [L_row_map, L_entries, L_values, U_row_map, + U_entries, U_values] = + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + + // Create L, U + auto L = make_matrix("L_Mtx", L_row_map, L_entries, L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, U_values, block_size); + + // Set initial vectors: + ValuesType X("X", nrows); // Solution and initial guess + ValuesType Wj("Wj", nrows); // For checking residuals at end. + ValuesType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + nrows); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b + { + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_GT(num_iters_plain, 0); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "Without LUPrec, with block_size=" << block_size << ", converged in " << num_iters_plain << " steps with endres=" << endRes << std::endl; + } + } - // Make precond. - KokkosSparse::Experimental::LUPrec myPrec(L, U); + // Solve Ax = b with LU preconditioner. + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); - // reset X for next gmres call - Kokkos::deep_copy(X, 0.0); + // Make precond. + KokkosSparse::Experimental::LUPrec myPrec(L, U); - gmres(&kh, A, B, X, &myPrec); + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - float_t endRes = KokkosBlas::nrm2(B) / nrmB; + gmres(&kh, A, B, X, &myPrec); - const auto conv_flag = gmres_handle->get_conv_flag_val(); - num_iters_precond = gmres_handle->get_num_iters(); + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - EXPECT_LT(num_iters_precond, num_iters_plain); + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); + + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "With LUPrec, with block_size=" << block_size << ", and fill_level=" << fill_lev << ", converged in " << num_iters_precond << " steps with endres=" << endRes << std::endl; + } + } } } } @@ -880,7 +848,8 @@ void test_spiluk() { TestStruct::run_test_spiluk_blocks(); TestStruct::run_test_spiluk_scale(); TestStruct::run_test_spiluk_scale_blocks(); - TestStruct::run_test_spiluk_precond_blocks(); + TestStruct::template run_test_spiluk_precond(); + TestStruct::template run_test_spiluk_precond(); } template Date: Wed, 14 Feb 2024 12:59:56 -0700 Subject: [PATCH 15/17] formatting --- sparse/unit_test/Test_Sparse_spiluk.hpp | 138 +++++++++++++----------- 1 file changed, 78 insertions(+), 60 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index a4ad1c7b33..de7c56bb2e 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -81,17 +81,15 @@ std::vector> get_fixture() { #endif template < - typename MatrixType, - typename CRS, - typename std::enable_if::value>::type* = nullptr> -MatrixType get_A(CRS A_unblocked, const size_t ) { + typename MatrixType, typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t) { return A_unblocked; } template < - typename MatrixType, - typename CRS, - typename std::enable_if::value>::type* = nullptr> + typename MatrixType, typename CRS, + typename std::enable_if::value>::type* = nullptr> MatrixType get_A(CRS A_unblocked, const size_t block_size) { // Convert to BSR MatrixType A(A_unblocked, block_size); @@ -100,22 +98,29 @@ MatrixType get_A(CRS A_unblocked, const size_t block_size) { } template < - typename MatrixType, typename RowMapType, typename EntriesType, typename ValuesType, - typename std::enable_if::value>::type* = nullptr> -MatrixType make_matrix(const char* name, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const size_t) { + typename MatrixType, typename RowMapType, typename EntriesType, + typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const size_t) { const auto nrows = row_map.extent(0) - 1; - return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, entries); + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, + entries); } template < - typename MatrixType, typename RowMapType, typename EntriesType, typename ValuesType, - typename std::enable_if::value>::type* = nullptr> -MatrixType make_matrix(const char* name, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const size_t block_size) { + typename MatrixType, typename RowMapType, typename EntriesType, + typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const size_t block_size) { const auto nrows = row_map.extent(0) - 1; - return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, entries, block_size); + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, + entries, block_size); } - static constexpr double EPS = 1e-7; template - static void check_result( - const RowMapType& row_map, const EntriesType& entries, - const ValuesType& values, const RowMapType& L_row_map, - const EntriesType& L_entries, const ValuesType& L_values, - const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values, const lno_t fill_lev, const size_type block_size = 1) { - + static void check_result(const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const RowMapType& L_row_map, + const EntriesType& L_entries, + const ValuesType& L_values, + const RowMapType& U_row_map, + const EntriesType& U_entries, + const ValuesType& U_values, const lno_t fill_lev, + const size_type block_size = 1) { using sp_matrix_type = std::conditional_t; KK_REQUIRE(UseBlocks || (block_size == 1)); // Checking const auto nrows = row_map.extent(0) - 1; - auto A = make_matrix("A_Mtx", row_map, entries, values, block_size); - auto L = make_matrix("L_Mtx", L_row_map, L_entries, L_values, block_size); - auto U = make_matrix("U_Mtx", U_row_map, U_entries, U_values, block_size); + auto A = make_matrix("A_Mtx", row_map, entries, values, + block_size); + auto L = make_matrix("L_Mtx", L_row_map, L_entries, + L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, + U_values, block_size); EXPECT_TRUE(is_triangular(L_row_map, L_entries, true)); EXPECT_TRUE(is_triangular(U_row_map, U_entries, false)); @@ -219,8 +229,7 @@ struct SpilukTest { std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev; if (UseBlocks) { std::cout << ", block_size=" << block_size; - } - else { + } else { std::cout << ", unblocked"; } std::cout << " had residual: " << result << std::endl; @@ -237,26 +246,25 @@ struct SpilukTest { if (fill_lev > 1) { if (UseBlocks) { EXPECT_LT(result, 1e-2); - } - else { + } else { EXPECT_LT(result, 1e-4); } } } - template + template static std::tuple - run_and_check_spiluk( - KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, - const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev, - const size_type block_size = 1) - { + run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + SPILUKAlgorithm alg, const lno_t fill_lev, + const size_type block_size = 1) { KK_REQUIRE(UseBlocks || (block_size == 1)); const size_type block_items = block_size * block_size; const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, !UseBlocks ? 0 : block_size); + kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, + !UseBlocks ? 0 : block_size); auto spiluk_handle = kh.get_spiluk_handle(); if (TeamSize != -1) { @@ -284,8 +292,9 @@ struct SpilukTest { Kokkos::fence(); - check_result(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, fill_lev, block_size); + check_result(row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, fill_lev, + block_size); kh.destroy_spiluk_handle(); @@ -294,7 +303,8 @@ struct SpilukTest { if (block_size == 1 && UseBlocks) { const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, U_entries_u, U_values_u] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + alg, fill_lev); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); @@ -308,7 +318,8 @@ struct SpilukTest { if (TeamSize != 1) { const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, U_entries_ts1, U_values_ts1] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev, block_size); + run_and_check_spiluk(kh, row_map, entries, values, alg, + fill_lev, block_size); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); @@ -373,7 +384,6 @@ struct SpilukTest { std::vector block_sizes = {1, block_size}; for (auto block_size_itr : block_sizes) { - Bsr bsr(crs, block_size_itr); // Pull out views from BSR @@ -699,8 +709,7 @@ struct SpilukTest { } template - static void run_test_spiluk_precond() - { + static void run_test_spiluk_precond() { // Test using par_ilut as a preconditioner // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? @@ -717,8 +726,9 @@ struct SpilukTest { if (UseBlocks) { // Skip test if not on host. block trsv only works on host - static constexpr bool is_host = std::is_same< - execution_space, typename Kokkos::DefaultHostExecutionSpace>::value; + static constexpr bool is_host = + std::is_same::value; if (!is_host) { return; } @@ -735,9 +745,10 @@ struct SpilukTest { KokkosSparse::sort_crs_matrix(A_unblocked); - std::vector block_sizes_blocked = {1, 2, 4, 10}; + std::vector block_sizes_blocked = {1, 2, 4, 10}; std::vector block_sizes_unblocked = {1}; - std::vector block_sizes = UseBlocks ? block_sizes_blocked : block_sizes_unblocked; + std::vector block_sizes = + UseBlocks ? block_sizes_blocked : block_sizes_unblocked; for (auto block_size : block_sizes) { // Convert to BSR if block enabled @@ -757,19 +768,20 @@ struct SpilukTest { auto gmres_handle = kh.get_gmres_handle(); gmres_handle->set_verbose(verbose); using GMRESHandle = - typename std::remove_reference::type; + typename std::remove_reference::type; for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { - - const auto [L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values] = - run_and_check_spiluk(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); + const auto [L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values] = + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, + fill_lev, block_size); // Create L, U - auto L = make_matrix("L_Mtx", L_row_map, L_entries, L_values, block_size); - auto U = make_matrix("U_Mtx", U_row_map, U_entries, U_values, block_size); + auto L = make_matrix("L_Mtx", L_row_map, L_entries, + L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, + U_values, block_size); // Set initial vectors: ValuesType X("X", nrows); // Solution and initial guess @@ -799,7 +811,9 @@ struct SpilukTest { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "Without LUPrec, with block_size=" << block_size << ", converged in " << num_iters_plain << " steps with endres=" << endRes << std::endl; + std::cout << "Without LUPrec, with block_size=" << block_size + << ", converged in " << num_iters_plain + << " steps with endres=" << endRes << std::endl; } } @@ -809,7 +823,8 @@ struct SpilukTest { gmres_handle->set_verbose(verbose); // Make precond. - KokkosSparse::Experimental::LUPrec myPrec(L, U); + KokkosSparse::Experimental::LUPrec + myPrec(L, U); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); @@ -830,7 +845,10 @@ struct SpilukTest { EXPECT_LT(num_iters_precond, num_iters_plain); if (TEST_SPILUK_VERBOSE_LEVEL > 0) { - std::cout << "With LUPrec, with block_size=" << block_size << ", and fill_level=" << fill_lev << ", converged in " << num_iters_precond << " steps with endres=" << endRes << std::endl; + std::cout << "With LUPrec, with block_size=" << block_size + << ", and fill_level=" << fill_lev << ", converged in " + << num_iters_precond << " steps with endres=" << endRes + << std::endl; } } } From e05bb9a3c13858c3eb8eef5469d795dc2efd9314 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 20 Feb 2024 13:39:43 -0700 Subject: [PATCH 16/17] Clean up a few issues uncovered by gh review --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 52 +++++++++---------- sparse/src/KokkosSparse_LUPrec.hpp | 20 +++---- sparse/unit_test/Test_Sparse_spiluk.hpp | 4 +- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 29a78625f8..d4dfc2ed42 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -160,10 +160,10 @@ struct IlukWrap { team.team_barrier(); } - // gemm, alpha is hardcoded to -1, beta hardcoded to 1 + // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - void gemm(const scalar_t &A, const scalar_t &B, scalar_t &C) const { - C += -1 * A * B; + void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) const { + C -= A * B; } // lget @@ -214,22 +214,22 @@ struct IlukWrap { // BSR data is in LayoutRight! using Layout = Kokkos::LayoutRight; - using LValuesUnmanaged2DBlockType = Kokkos::View< + using LBlock = Kokkos::View< typename LValuesType::value_type **, Layout, typename LValuesType::device_type, Kokkos::MemoryTraits >; - using UValuesUnmanaged2DBlockType = Kokkos::View< + using UBlock = Kokkos::View< typename UValuesType::value_type **, Layout, typename UValuesType::device_type, Kokkos::MemoryTraits >; - using AValuesUnmanaged2DBlockType = Kokkos::View< + using ABlock = Kokkos::View< typename AValuesType::value_type **, Layout, typename AValuesType::device_type, Kokkos::MemoryTraits >; - using reftype = LValuesUnmanaged2DBlockType; + using reftype = LBlock; Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, @@ -264,7 +264,7 @@ struct IlukWrap { KOKKOS_INLINE_FUNCTION void lset(const size_type block, - const AValuesUnmanaged2DBlockType &rhs) const { + const ABlock &rhs) const { auto lblock = lget(block); for (size_type i = 0; i < block_size; ++i) { for (size_type j = 0; j < block_size; ++j) { @@ -281,7 +281,7 @@ struct IlukWrap { KOKKOS_INLINE_FUNCTION void uset(const size_type block, - const AValuesUnmanaged2DBlockType &rhs) const { + const ABlock &rhs) const { auto ublock = uget(block); for (size_type i = 0; i < block_size; ++i) { for (size_type j = 0; j < block_size; ++j) { @@ -298,8 +298,8 @@ struct IlukWrap { // divide. lhs /= rhs KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const LValuesUnmanaged2DBlockType &lhs, - const UValuesUnmanaged2DBlockType &rhs) const { + void divide(const member_type &team, const LBlock &lhs, + const UBlock &rhs) const { KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, // not 100% on this @@ -308,38 +308,38 @@ struct IlukWrap { invoke(team, 1.0, rhs, lhs); } - // gemm, alpha is hardcoded to -1, beta hardcoded to 1 - // C += -1 * A * B; + // multiply_subtract. C -= A * B template - KOKKOS_INLINE_FUNCTION void gemm(const UValuesUnmanaged2DBlockType &A, - const LValuesUnmanaged2DBlockType &B, - CView &C) const { + KOKKOS_INLINE_FUNCTION void multiply_subtract(const UBlock &A, + const LBlock &B, + CView &C) const { + // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 KokkosBatched::SerialGemm:: - invoke( + invoke( -1.0, A, B, 1.0, C); } // lget KOKKOS_INLINE_FUNCTION - LValuesUnmanaged2DBlockType lget(const size_type block) const { - return LValuesUnmanaged2DBlockType( + LBlock lget(const size_type block) const { + return LBlock( L_values.data() + (block * block_items), block_size, block_size); } // uget KOKKOS_INLINE_FUNCTION - UValuesUnmanaged2DBlockType uget(const size_type block) const { - return UValuesUnmanaged2DBlockType( + UBlock uget(const size_type block) const { + return UBlock( U_values.data() + (block * block_items), block_size, block_size); } // aget KOKKOS_INLINE_FUNCTION - AValuesUnmanaged2DBlockType aget(const size_type block) const { - return AValuesUnmanaged2DBlockType( + ABlock aget(const size_type block) const { + return ABlock( A_values.data() + (block * block_items), block_size, block_size); } @@ -359,7 +359,7 @@ struct IlukWrap { // print KOKKOS_INLINE_FUNCTION - void print(const LValuesUnmanaged2DBlockType &item) const { + void print(const LBlock &item) const { for (size_type i = 0; i < block_size; ++i) { std::cout << " "; for (size_type j = 0; j < block_size; ++j) { @@ -459,7 +459,7 @@ struct IlukWrap { if (ipos != -1) { typename Base::reftype C = col < rowid ? Base::lget(ipos) : Base::uget(ipos); - Base::gemm(fact, Base::uget(kk), C); + Base::multiply_subtract(fact, Base::uget(kk), C); } }); // end for kk diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 6d492634c5..d687c8dd4f 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -46,9 +46,9 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using MEMSP = typename CRS::memory_space; - using DEVC = typename Kokkos::Device; + using DEVICE = typename Kokkos::Device; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views @@ -85,8 +85,8 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { template < typename Matrix, typename std::enable_if::value>::type * = nullptr> - void apply_impl(const Kokkos::View &X, - const Kokkos::View &Y, + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x @@ -108,8 +108,8 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { template < typename Matrix, typename std::enable_if::value>::type * = nullptr> - void apply_impl(const Kokkos::View &X, - const Kokkos::View &Y, + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x @@ -126,10 +126,10 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { // trsv is implemented for MV so we need to convert our views using UView2d = typename Kokkos::View< - ScalarType **, Layout, DEVC, + ScalarType **, Layout, DEVICE, Kokkos::MemoryTraits >; using UView2dc = typename Kokkos::View< - const ScalarType **, Layout, DEVC, + const ScalarType **, Layout, DEVICE, Kokkos::MemoryTraits >; UView2dc X2d(X.data(), X.extent(0), 1); UView2d Y2d(Y.data(), Y.extent(0), 1), @@ -153,8 +153,8 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// ///// It takes L and U and the stores U^inv L^inv X in Y // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, + virtual void apply(const Kokkos::View &X, + const Kokkos::View &Y, const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index de7c56bb2e..08f41eefbb 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -710,12 +710,10 @@ struct SpilukTest { template static void run_test_spiluk_precond() { - // Test using par_ilut as a preconditioner + // Test using spiluk as a preconditioner // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? // Create a diagonally dominant sparse matrix to test: - // par_ilut settings max_iters, res_delta_stop, fill_in_limit, and - // async_update are all left as defaults using sp_matrix_type = std::conditional_t; constexpr auto nrows = 5000; From 47d5756711df8b0c4a8d5d05879a40dedfe79639 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 20 Feb 2024 13:40:40 -0700 Subject: [PATCH 17/17] formatting --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index d4dfc2ed42..b3b5dfa277 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -162,7 +162,8 @@ struct IlukWrap { // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - void multiply_subtract(const scalar_t &A, const scalar_t &B, scalar_t &C) const { + void multiply_subtract(const scalar_t &A, const scalar_t &B, + scalar_t &C) const { C -= A * B; } @@ -263,8 +264,7 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void lset(const size_type block, - const ABlock &rhs) const { + void lset(const size_type block, const ABlock &rhs) const { auto lblock = lget(block); for (size_type i = 0; i < block_size; ++i) { for (size_type j = 0; j < block_size; ++j) { @@ -280,8 +280,7 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void uset(const size_type block, - const ABlock &rhs) const { + void uset(const size_type block, const ABlock &rhs) const { auto ublock = uget(block); for (size_type i = 0; i < block_size; ++i) { for (size_type j = 0; j < block_size; ++j) { @@ -314,33 +313,32 @@ struct IlukWrap { const LBlock &B, CView &C) const { // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBatched::SerialGemm:: - invoke( - -1.0, A, B, 1.0, C); + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Unblocked>::invoke( + -1.0, A, B, 1.0, C); } // lget KOKKOS_INLINE_FUNCTION LBlock lget(const size_type block) const { - return LBlock( - L_values.data() + (block * block_items), block_size, block_size); + return LBlock(L_values.data() + (block * block_items), block_size, + block_size); } // uget KOKKOS_INLINE_FUNCTION UBlock uget(const size_type block) const { - return UBlock( - U_values.data() + (block * block_items), block_size, block_size); + return UBlock(U_values.data() + (block * block_items), block_size, + block_size); } // aget KOKKOS_INLINE_FUNCTION ABlock aget(const size_type block) const { - return ABlock( - A_values.data() + (block * block_items), block_size, block_size); + return ABlock(A_values.data() + (block * block_items), block_size, + block_size); } // uequal