diff --git a/libs/core/algorithms/include/hpx/parallel/datapar/iterator_helpers.hpp b/libs/core/algorithms/include/hpx/parallel/datapar/iterator_helpers.hpp index fc9c73e02ed5..ee100a269249 100644 --- a/libs/core/algorithms/include/hpx/parallel/datapar/iterator_helpers.hpp +++ b/libs/core/algorithms/include/hpx/parallel/datapar/iterator_helpers.hpp @@ -11,6 +11,7 @@ #if defined(HPX_HAVE_DATAPAR) #include #include +#include #include #include #include @@ -133,6 +134,31 @@ namespace hpx::parallel::util::detail { } }; + template + struct datapar_loop_step>> + { + using V1 = traits::vector_pack_type_t; + using V = traits::vector_pack_type_t; + + template + HPX_HOST_DEVICE HPX_FORCEINLINE static constexpr void call1(F&& f, I& i) + { + V1 tmp(i); + HPX_INVOKE(f, tmp); + ++i; + } + + template + HPX_HOST_DEVICE HPX_FORCEINLINE static constexpr void callv(F&& f, I& i) + { + V tmp; + for (std::size_t e = 0; e != traits::size(tmp); ++e) + traits::set(tmp, e, static_cast(i + e)); + HPX_INVOKE(f, tmp); + i += traits::vector_pack_size_v; + } + }; + /////////////////////////////////////////////////////////////////////////// template struct datapar_loop_pred_step diff --git a/libs/core/algorithms/include/hpx/parallel/datapar/loop.hpp b/libs/core/algorithms/include/hpx/parallel/datapar/loop.hpp index 3fb7af885c25..98c93b2a845a 100644 --- a/libs/core/algorithms/include/hpx/parallel/datapar/loop.hpp +++ b/libs/core/algorithms/include/hpx/parallel/datapar/loop.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2007-2023 Hartmut Kaiser +// Copyright (c) 2007-2025 Hartmut Kaiser // // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -238,8 +239,12 @@ namespace hpx::parallel::util { }; /////////////////////////////////////////////////////////////////////// + template + struct datapar_loop_n; + template - struct datapar_loop_n + struct datapar_loop_n>> { using iterator_type = std::decay_t; using value_type = @@ -258,8 +263,9 @@ namespace hpx::parallel::util { { std::size_t len = count; + // clang-format off for (/* */; !detail::is_data_aligned(first) && len != 0; - --len) + --len) { datapar_loop_step::call1(f, first); } @@ -268,16 +274,18 @@ namespace hpx::parallel::util { for (auto len_v = static_cast(len - (size + 1)); - len_v > 0; - len_v -= static_cast(size), len -= size) + len_v > 0; + len_v -= static_cast(size), len -= size) { datapar_loop_step::callv(f, first); } + // clang-format on for (/* */; len != 0; --len) { datapar_loop_step::call1(f, first); } + return first; } else @@ -302,6 +310,51 @@ namespace hpx::parallel::util { } }; + template + struct datapar_loop_n>> + { + using V = traits::vector_pack_type_t; + + template + HPX_HOST_DEVICE HPX_FORCEINLINE static constexpr Iter call( + Iter first, std::size_t count, F&& f) + { + std::size_t len = count; + constexpr std::size_t size = traits::vector_pack_size_v; + + for (size_t i = first % size; i != 0 && len != 0; --i, --len) + { + datapar_loop_step::call1(f, first); + } + + // clang-format off + for (auto len_v = static_cast(len - (size + 1)); + len_v > 0; + len_v -= static_cast(size), len -= size) + { + datapar_loop_step::callv(f, first); + } + // clang-format on + + for (/* */; len != 0; --len) + { + datapar_loop_step::call1(f, first); + } + return first; + } + + template + HPX_HOST_DEVICE HPX_FORCEINLINE static constexpr Iter call( + Iter first, std::size_t count, CancelToken& tok, F&& f) + { + // check at the start of a partition only + if (tok.was_cancelled()) + return first; + + return call(first, count, HPX_FORWARD(F, f)); + } + }; + /////////////////////////////////////////////////////////////////////// template struct datapar_loop_n_ind @@ -323,8 +376,9 @@ namespace hpx::parallel::util { { std::size_t len = count; + // clang-format off for (/* */; !detail::is_data_aligned(first) && len != 0; - --len) + --len) { datapar_loop_step_ind::call1(f, first); } @@ -333,11 +387,12 @@ namespace hpx::parallel::util { for (auto len_v = static_cast(len - (size + 1)); - len_v > 0; - len_v -= static_cast(size), len -= size) + len_v > 0; + len_v -= static_cast(size), len -= size) { datapar_loop_step_ind::callv(f, first); } + // clang-format on for (/* */; len != 0; --len) { @@ -381,14 +436,16 @@ namespace hpx::parallel::util { constexpr std::size_t size = traits::vector_pack_size_v; + // clang-format off for (auto len_v = static_cast(len - (size + 1)); - len_v > 0; - len_v -= static_cast(size), len -= size) + len_v > 0; + len_v -= static_cast(size), len -= size) { datapar_loop_idx_step::callv(f, it, base_idx); std::advance(it, size); base_idx += size; } + // clang-format on for (/* */; len != 0; --len) { diff --git a/libs/core/algorithms/tests/unit/datapar_algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/datapar_algorithms/CMakeLists.txt index cd15949a449c..b17436167fcc 100644 --- a/libs/core/algorithms/tests/unit/datapar_algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/datapar_algorithms/CMakeLists.txt @@ -29,6 +29,7 @@ if(HPX_WITH_DATAPAR) foreach_datapar foreach_datapar_zipiter foreachn_datapar + for_loop_datapar generate_datapar generaten_datapar mismatch_binary_datapar diff --git a/libs/core/algorithms/tests/unit/datapar_algorithms/for_loop_datapar.cpp b/libs/core/algorithms/tests/unit/datapar_algorithms/for_loop_datapar.cpp new file mode 100644 index 000000000000..fae6e7d45b97 --- /dev/null +++ b/libs/core/algorithms/tests/unit/datapar_algorithms/for_loop_datapar.cpp @@ -0,0 +1,119 @@ +// Copyright (c) 2016-2025 Hartmut Kaiser +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/////////////////////////////////////////////////////////////////////////////// +unsigned int seed = std::random_device{}(); +std::mt19937 gen(seed); + +/////////////////////////////////////////////////////////////////////////////// +template +void test_for_loop_idx(ExPolicy&& policy) +{ + static_assert(hpx::is_execution_policy_v, + "hpx::is_execution_policy_v"); + + std::vector c(10007); + std::iota(std::begin(c), std::end(c), gen()); + + hpx::experimental::for_loop( + std::forward(policy), 0, int(c.size()), [&c](auto i) { + for (std::size_t e = 0; e < hpx::parallel::traits::size(i); ++e) + c[hpx::parallel::traits::get(i, e)] = 42; + }); + + // verify values + std::size_t count = 0; + std::for_each(std::begin(c), std::end(c), [&count](std::size_t v) -> void { + HPX_TEST_EQ(v, std::size_t(42)); + ++count; + }); + HPX_TEST_EQ(count, c.size()); +} + +template +void test_for_loop_idx_async(ExPolicy&& p) +{ + std::vector c(10007); + std::iota(std::begin(c), std::end(c), gen()); + + auto f = hpx::experimental::for_loop( + std::forward(p), 0, int(c.size()), [&c](auto i) { + for (std::size_t e = 0; e < hpx::parallel::traits::size(i); ++e) + c[hpx::parallel::traits::get(i, e)] = 42; + }); + f.wait(); + + // verify values + std::size_t count = 0; + std::for_each(std::begin(c), std::end(c), [&count](std::size_t v) -> void { + HPX_TEST_EQ(v, std::size_t(42)); + ++count; + }); + HPX_TEST_EQ(count, c.size()); +} + +void for_loop_test_idx() +{ + using namespace hpx::execution; + + test_for_loop_idx(simd); + test_for_loop_idx(par_simd); + + test_for_loop_idx_async(simd(task)); + test_for_loop_idx_async(par_simd(task)); +} + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + if (vm.count("seed")) + seed = vm["seed"].as(); + + std::cout << "using seed: " << seed << std::endl; + gen.seed(seed); + + for_loop_test_idx(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + + // By default this test should run on all available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +} diff --git a/libs/core/execution/include/hpx/execution/traits/detail/eve/vector_pack_get_set.hpp b/libs/core/execution/include/hpx/execution/traits/detail/eve/vector_pack_get_set.hpp index 1e265b5d2c10..a72866ca12fa 100644 --- a/libs/core/execution/include/hpx/execution/traits/detail/eve/vector_pack_get_set.hpp +++ b/libs/core/execution/include/hpx/execution/traits/detail/eve/vector_pack_get_set.hpp @@ -9,25 +9,50 @@ #include #if defined(HPX_HAVE_DATAPAR_EVE) + +#include +#include +#include +#include + #include namespace hpx::parallel::traits { /////////////////////////////////////////////////////////////////////// - template + template )> HPX_HOST_DEVICE HPX_FORCEINLINE auto get( Vector& vec, std::size_t index) noexcept { return vec.get(index); } + template )> + HPX_HOST_DEVICE HPX_FORCEINLINE auto get( + Scalar& sc, [[maybe_unused]] std::size_t index) noexcept + { + HPX_ASSERT(index == 0); + return sc; + } + /////////////////////////////////////////////////////////////////////// - template + template )> HPX_HOST_DEVICE HPX_FORCEINLINE auto set( Vector& vec, std::size_t index, T val) noexcept { vec.set(index, val); } + + template )> + HPX_HOST_DEVICE HPX_FORCEINLINE auto set( + Scalar& sc, [[maybe_unused]] std::size_t index, T val) noexcept + { + HPX_ASSERT(index == 0); + sc = val; + } } // namespace hpx::parallel::traits #endif diff --git a/libs/core/execution/include/hpx/execution/traits/detail/simd/vector_pack_get_set.hpp b/libs/core/execution/include/hpx/execution/traits/detail/simd/vector_pack_get_set.hpp index cb8a30629a0b..33b9d5617fb2 100644 --- a/libs/core/execution/include/hpx/execution/traits/detail/simd/vector_pack_get_set.hpp +++ b/libs/core/execution/include/hpx/execution/traits/detail/simd/vector_pack_get_set.hpp @@ -10,26 +10,48 @@ #if defined(HPX_HAVE_DATAPAR_EXPERIMENTAL_SIMD) +#include +#include #include +#include #include namespace hpx::parallel::traits { /////////////////////////////////////////////////////////////////////// - template + template )> HPX_HOST_DEVICE HPX_FORCEINLINE auto get( Vector& vec, std::size_t index) noexcept { return vec[index]; } + template )> + HPX_HOST_DEVICE HPX_FORCEINLINE auto get( + Scalar& sc, [[maybe_unused]] std::size_t index) noexcept + { + HPX_ASSERT(index == 0); + return sc; + } + /////////////////////////////////////////////////////////////////////// - template + template )> HPX_HOST_DEVICE HPX_FORCEINLINE auto set( Vector& vec, std::size_t index, T val) noexcept { - datapar::experimental::set(vec, index, val); + vec[index] = val; + } + + template )> + HPX_HOST_DEVICE HPX_FORCEINLINE auto set( + Scalar& sc, [[maybe_unused]] std::size_t index, T val) noexcept + { + HPX_ASSERT(index == 0); + sc = val; } } // namespace hpx::parallel::traits diff --git a/libs/core/execution/include/hpx/execution/traits/detail/vc/vector_pack_get_set.hpp b/libs/core/execution/include/hpx/execution/traits/detail/vc/vector_pack_get_set.hpp index c4a2d03c48da..81ff9aa24527 100644 --- a/libs/core/execution/include/hpx/execution/traits/detail/vc/vector_pack_get_set.hpp +++ b/libs/core/execution/include/hpx/execution/traits/detail/vc/vector_pack_get_set.hpp @@ -9,12 +9,17 @@ #include #if defined(HPX_HAVE_DATAPAR_VC) +#include +#include + #include namespace hpx::parallel::traits { /////////////////////////////////////////////////////////////////////// - template + template || is_scalar_vector_pack_v)> HPX_HOST_DEVICE HPX_FORCEINLINE auto get( Vector& vec, std::size_t index) noexcept { @@ -22,7 +27,9 @@ namespace hpx::parallel::traits { } /////////////////////////////////////////////////////////////////////// - template + template || is_scalar_vector_pack_v)> HPX_HOST_DEVICE HPX_FORCEINLINE auto set( Vector& vec, std::size_t index, T val) noexcept { diff --git a/libs/core/execution/include/hpx/execution/traits/vector_pack_alignment_size.hpp b/libs/core/execution/include/hpx/execution/traits/vector_pack_alignment_size.hpp index 66932e7e3f9b..1ba7473779c1 100644 --- a/libs/core/execution/include/hpx/execution/traits/vector_pack_alignment_size.hpp +++ b/libs/core/execution/include/hpx/execution/traits/vector_pack_alignment_size.hpp @@ -88,6 +88,15 @@ namespace hpx::parallel::traits { template inline constexpr std::size_t vector_pack_size_v = vector_pack_size::value; + + //////////////////////////////////////////////////////////////////////////// + template || + is_scalar_vector_pack_v>> + constexpr std::size_t size(Pack) noexcept + { + return vector_pack_size_v; + } } // namespace hpx::parallel::traits #if !defined(__CUDACC__) diff --git a/libs/core/mpi_base/include/hpx/mpi_base/mpi_environment.hpp b/libs/core/mpi_base/include/hpx/mpi_base/mpi_environment.hpp index 384b993b35b0..b6ea8d17ab56 100644 --- a/libs/core/mpi_base/include/hpx/mpi_base/mpi_environment.hpp +++ b/libs/core/mpi_base/include/hpx/mpi_base/mpi_environment.hpp @@ -44,7 +44,8 @@ namespace hpx::util { static std::string get_processor_name(); static MPI_Datatype type_contiguous(size_t nbytes); - static MPI_Request isend(void* address, size_t size, int rank, int tag); + static MPI_Request isend( + void const* address, size_t size, int rank, int tag); static MPI_Request irecv(void* address, size_t size, int rank, int tag); struct HPX_CORE_EXPORT scoped_lock diff --git a/libs/core/mpi_base/src/mpi_environment.cpp b/libs/core/mpi_base/src/mpi_environment.cpp index a251fc579932..b09f6d8665d9 100644 --- a/libs/core/mpi_base/src/mpi_environment.cpp +++ b/libs/core/mpi_base/src/mpi_environment.cpp @@ -470,9 +470,9 @@ namespace hpx::util { } // Acknowledgement: code adapted from github.com/jeffhammond/BigMPI - MPI_Datatype mpi_environment::type_contiguous(size_t nbytes) + MPI_Datatype mpi_environment::type_contiguous(size_t const nbytes) { - size_t int_max = (std::numeric_limits::max)(); + constexpr int int_max = (std::numeric_limits::max)(); size_t c = nbytes / int_max; size_t r = nbytes % int_max; @@ -481,13 +481,14 @@ namespace hpx::util { HPX_ASSERT(r < int_max); MPI_Datatype chunks; - MPI_Type_vector(c, int_max, int_max, MPI_BYTE, &chunks); + MPI_Type_vector( + static_cast(c), int_max, int_max, MPI_BYTE, &chunks); MPI_Datatype remainder; - MPI_Type_contiguous(r, MPI_BYTE, &remainder); + MPI_Type_contiguous(static_cast(r), MPI_BYTE, &remainder); - MPI_Aint remdisp = (MPI_Aint) c * int_max; - int blocklengths[2] = {1, 1}; + MPI_Aint const remdisp = static_cast(c) * int_max; + constexpr int blocklengths[2] = {1, 1}; MPI_Aint displacements[2] = {0, remdisp}; MPI_Datatype types[2] = {chunks, remainder}; MPI_Datatype newtype; @@ -500,7 +501,7 @@ namespace hpx::util { } MPI_Request mpi_environment::isend( - void* address, size_t size, int rank, int tag) + void const* address, size_t size, int rank, int tag) { MPI_Request request; MPI_Datatype datatype; diff --git a/libs/full/performance_counters/include/hpx/performance_counters/query_counters.hpp b/libs/full/performance_counters/include/hpx/performance_counters/query_counters.hpp index 6599df3354f3..d154b022375c 100644 --- a/libs/full/performance_counters/include/hpx/performance_counters/query_counters.hpp +++ b/libs/full/performance_counters/include/hpx/performance_counters/query_counters.hpp @@ -88,19 +88,19 @@ namespace hpx::util { performance_counters::counter_values_array const& value); template - static void print_name_csv(Stream& out, std::string const& name); + void print_name_csv(Stream& out, std::string const& name); template - static void print_value_csv(Stream* out, + void print_value_csv(Stream* out, performance_counters::counter_info const& infos, performance_counters::counter_value const& value); template - static void print_value_csv(Stream* out, + void print_value_csv(Stream* out, performance_counters::counter_info const& infos, performance_counters::counter_values_array const& value); template - static void print_name_csv_short(Stream& out, std::string const& name); + void print_name_csv_short(Stream& out, std::string const& name); private: using mutex_type = hpx::mutex;