diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index c11c6bdc02..119941cebc 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -219,14 +219,22 @@ void runGS(const GS_Parameters& params) { KokkosSparse::Experimental::gauss_seidel_symbolic( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); - double symbolicTime = timer.seconds(); - std::cout << "\n*** Symbolic time: " << symbolicTime << '\n'; + double symbolicLaunchTime = timer.seconds(); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double symbolicComputeTime = timer.seconds(); + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); - double numericTime = timer.seconds(); - std::cout << "\n*** Numeric time: " << numericTime << '\n'; + double numericLaunchTime = timer.seconds(); + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double numericComputeTime = timer.seconds(); + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -246,8 +254,14 @@ void runGS(const GS_Parameters& params) { true, true, 1.0, params.sweeps); break; } - double applyTime = timer.seconds(); - std::cout << "\n*** Apply time: " << applyTime << '\n'; + + double applyLaunchTime = timer.seconds(); + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double applyComputeTime = timer.seconds(); + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; + timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual scalar_view_t res("Ax-b", nrows); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index e4cfb4b047..0f03eb04b3 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1778,25 +1778,31 @@ class PointGaussSeidel { if (block_size == 1) { Kokkos::parallel_for( labelRegular, - team_policy_t((numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else if (gs.num_max_vals_in_l2 == 0) { Kokkos::parallel_for( labelBlock, - block_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + block_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else { Kokkos::parallel_for( labelBigBlock, - bigblock_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + bigblock_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -1818,12 +1824,16 @@ class PointGaussSeidel { Kokkos::deep_copy(long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, - longrow_apply_team_policy_t(numLongRows * teams_per_row, - longRowTeamSize), + Kokkos::Experimental::require( + longrow_apply_team_policy_t(numLongRows * teams_per_row, + longRowTeamSize), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::require( + range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1874,7 +1884,9 @@ class PointGaussSeidel { if (numRegularRows) { Kokkos::parallel_for( labelShort, - range_pol(color_index_begin, color_index_end - numLongRows), + Kokkos::Experimental::require( + range_pol(color_index_begin, color_index_end - numLongRows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } if (numLongRows) { @@ -1890,13 +1902,18 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(long_row_x, nnz_scalar_t()); - Kokkos::parallel_for(labelLong, - Kokkos::RangePolicy( - 0, numLongRows * par_per_row), - gs); + Kokkos::parallel_for( + labelLong, + Kokkos::Experimental::require( + Kokkos::RangePolicy( + 0, numLongRows * par_per_row), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::require( + range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows));