diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 74c50322865..20091075af7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -106,7 +106,8 @@ build/cuda110/nompi/gcc/cuda/release/shared: # fix gtest issue https://github.com/google/googletest/issues/3514 CXX_FLAGS: "-Wno-error=maybe-uninitialized" # disable spurious unused argument warning - EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" + # this is seemingly broken with CUDA 11 + # EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" # nvhpc and friends diff --git a/CMakeLists.txt b/CMakeLists.txt index 476df2f2914..d7ca00b59b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,6 +122,7 @@ include(cmake/build_type_helpers.cmake) # Load other CMake helpers include(cmake/build_helpers.cmake) include(cmake/install_helpers.cmake) +include(cmake/compiler_features.cmake) if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") diff --git a/cmake/compiler_features.cmake b/cmake/compiler_features.cmake new file mode 100644 index 00000000000..f710016801d --- /dev/null +++ b/cmake/compiler_features.cmake @@ -0,0 +1,8 @@ +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + "#include + #include + static_assert(std::is_same::value, \"INSTANTIATE_UINT64\"); + int main() {}" + GKO_SIZE_T_IS_UINT64_T + FAIL_REGEX ".*INSTANTIATE_UINT64.#") diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 239613ef804..12ed608c322 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -273,7 +273,7 @@ endfunction(ginkgo_create_common_test_internal) function(ginkgo_create_common_device_test test_name) cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name ${ARGN}) - if(GINKGO_BUILD_SYCL) + if(GINKGO_BUILD_SYCL AND NOT ("dpcpp" IN_LIST common_device_test_DISABLE_EXECUTORS)) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property. @@ -281,17 +281,17 @@ function(ginkgo_create_common_device_test test_name) gko_add_sycl_to_target(TARGET ${test_target_name}_dpcpp SOURCES ${test_name}.dp.cpp) target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel) endif() - if(GINKGO_BUILD_OMP) + if(GINKGO_BUILD_OMP AND NOT ("omp" IN_LIST common_device_test_DISABLE_EXECUTORS)) ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN}) target_link_libraries(${test_target_name}_omp PUBLIC OpenMP::OpenMP_CXX) endif() - if(GINKGO_BUILD_CUDA) + if(GINKGO_BUILD_CUDA AND NOT ("cuda" IN_LIST common_device_test_DISABLE_EXECUTORS)) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.cu COPYONLY) ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN}) target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda) endif() - if(GINKGO_BUILD_HIP) + if(GINKGO_BUILD_HIP AND NOT ("hip" IN_LIST common_device_test_DISABLE_EXECUTORS)) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY) ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN}) diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt index e4e4fafe038..847be9b333a 100644 --- a/common/unified/CMakeLists.txt +++ b/common/unified/CMakeLists.txt @@ -5,6 +5,7 @@ set(UNIFIED_SOURCES components/fill_array_kernels.cpp components/format_conversion_kernels.cpp components/precision_conversion_kernels.cpp + components/range_minimum_query_kernels.cpp components/reduce_array_kernels.cpp distributed/assembly_kernels.cpp distributed/partition_helpers_kernels.cpp diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 2fd51dd939b..900f34df3b7 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -1,9 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #include "core/components/fill_array_kernels.hpp" +#include + #include "common/unified/base/kernel_launch.hpp" @@ -25,6 +27,11 @@ void fill_array(std::shared_ptr exec, ValueType* array, GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); template GKO_DECLARE_FILL_ARRAY_KERNEL(bool); +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint16); +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint32); +#ifndef GKO_SIZE_T_IS_UINT64_T +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint64); +#endif template diff --git a/common/unified/components/range_minimum_query_kernels.cpp b/common/unified/components/range_minimum_query_kernels.cpp new file mode 100644 index 00000000000..de8e264bbd9 --- /dev/null +++ b/common/unified/components/range_minimum_query_kernels.cpp @@ -0,0 +1,184 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query_kernels.hpp" + +#include + +#include "common/unified/base/kernel_launch.hpp" +#include "core/base/intrinsics.hpp" +#include "core/components/bit_packed_storage.hpp" +#include "core/components/range_minimum_query.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace range_minimum_query { + + +template +void compute_lookup_inside_blocks( + std::shared_ptr exec, const IndexType* values, + IndexType size, bit_packed_span& block_argmin, + IndexType* block_min, uint16* block_tree_index) +{ +#ifdef GKO_COMPILING_DPCPP + // The Intel SYCL compiler doesn't support constexpr initialization of + // non-trivial objects on the device. + GKO_NOT_IMPLEMENTED; +#else + using rmq_type = gko::range_minimum_query; + constexpr auto block_size = rmq_type::block_size; + using tree_index_type = std::decay_t; + using device_lut_type = typename rmq_type::block_lut_type; + using lut_type = typename rmq_type::block_lut_view_type; + static_assert( + lut_type::num_trees <= std::numeric_limits::max(), + "block type storage too small"); + // block_argmin stores multiple values per memory word, so we need to make + // sure that no two different threads write to the same memory location. + // The easiest way to do that is to have every thread handle all elements + // that map to the same memory location. + // The argmin inside a block is in the range [0, block_size - 1], so + // it needs ceil_log2_constexpr(block_size) bits. For efficiency + // reasons, we round that up to the next power of two. + // This expression is essentially bits_per_word / + // round_up_pow2_constexpr(ceil_log2_constexpr(block_size)), i.e. how + // many values are stored per word. + constexpr auto collation_width = + 1 << (std::decay_t::bits_per_word_log2 - + ceil_log2_constexpr(ceil_log2_constexpr(block_size))); + const device_lut_type lut{exec}; + run_kernel( + exec, + [] GKO_KERNEL(auto collated_block_idx, auto values, auto block_argmin, + auto block_min, auto block_tree_index, auto lut, + auto size) { + // we need to put this here because some compilers interpret capture + // rules around constexpr incorrectly + constexpr auto block_size = rmq_type::block_size; + constexpr auto infinity = std::numeric_limits::max(); + const auto num_blocks = ceildiv(size, block_size); + for (auto block_idx = collated_block_idx * collation_width; + block_idx < + std::min((collated_block_idx + 1) * collation_width, + num_blocks); + block_idx++) { + const auto i = block_idx * block_size; + IndexType local_values[block_size]; + int argmin = 0; +#pragma unroll + for (int local_i = 0; local_i < block_size; local_i++) { + // use "infinity" as sentinel for minimum computations + local_values[local_i] = + local_i + i < size ? values[local_i + i] : infinity; + if (local_values[local_i] < local_values[argmin]) { + argmin = local_i; + } + } + const auto tree_number = lut->compute_tree_index(local_values); + const auto min = local_values[argmin]; + block_argmin.set(block_idx, argmin); + block_min[block_idx] = min; + block_tree_index[block_idx] = + static_cast(tree_number); + } + }, + ceildiv(ceildiv(size, block_size), collation_width), values, + block_argmin, block_min, block_tree_index, lut.get(), size); +#endif +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL); + + +template +void compute_lookup_across_blocks( + std::shared_ptr exec, const IndexType* block_min, + IndexType num_blocks, + device_range_minimum_query_superblocks& superblocks) +{ +#ifdef GKO_COMPILING_DPCPP + GKO_NOT_IMPLEMENTED; +#else + if (num_blocks < 2) { + return; + } + using superblock_type = device_range_minimum_query_superblocks; + using storage_type = typename superblock_type::storage_type; + // we need to collate all writes that target the same memory word in a + // single thread + constexpr auto level0_collation_width = sizeof(storage_type) * CHAR_BIT; + // initialize the first level of blocks + run_kernel( + exec, + [] GKO_KERNEL(auto collated_i, auto block_min, auto superblocks, + auto num_blocks) { + constexpr auto infinity = std::numeric_limits::max(); + for (auto i = collated_i * level0_collation_width; + i < std::min((collated_i + 1) * level0_collation_width, + num_blocks); + i++) { + const auto min1 = block_min[i]; + const auto min2 = + i + 1 < num_blocks ? block_min[i + 1] : infinity; + // we need to use <= here to make sure ties always break to the + // left + superblocks.set_block_argmin(0, i, min1 <= min2 ? 0 : 1); + } + }, + ceildiv(num_blocks, level0_collation_width), block_min, superblocks, + num_blocks); + // we computed argmins for blocks of size 2, now recursively combine them. + const auto num_levels = superblocks.num_levels(); + for (int block_level = 1; block_level < num_levels; block_level++) { + const auto block_size = + superblock_type::block_size_for_level(block_level); + // we need block_level + 1 bits to represent values of size block_size + // and round up to the next power of two + const auto collation_width = + level0_collation_width / round_up_pow2(block_level + 1); + run_kernel( + exec, + [] GKO_KERNEL(auto collated_i, auto block_level, auto block_min, + auto superblocks, auto num_blocks, + auto collation_width) { + const auto block_size = + superblock_type::block_size_for_level(block_level); + for (auto i = collated_i * collation_width; + i < std::min((collated_i + 1) * collation_width, + num_blocks); + i++) { + const auto i2 = i + block_size / 2; + const auto argmin1 = + i + superblocks.block_argmin(block_level - 1, i); + const auto argmin2 = + i2 < num_blocks + ? i2 + superblocks.block_argmin(block_level - 1, i2) + : argmin1; + const auto min1 = block_min[argmin1]; + const auto min2 = block_min[argmin2]; + // we need to use <= here to make sure + // ties always break to the left + superblocks.set_block_argmin( + block_level, i, + min1 <= min2 ? argmin1 - i : argmin2 - i); + } + }, + ceildiv(num_blocks, collation_width), block_level, block_min, + superblocks, num_blocks, collation_width); + } +#endif +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL); + + +} // namespace range_minimum_query +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index a4e5647540f..54ce8fb59e6 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -42,6 +42,7 @@ target_sources(${ginkgo_core} base/segmented_array.cpp base/timer.cpp base/version.cpp + components/range_minimum_query.cpp config/config.cpp config/config_helper.cpp config/property_tree.cpp diff --git a/core/base/array.cpp b/core/base/array.cpp index a41f7c07e55..51fa4b34bb1 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -1,9 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #include "ginkgo/core/base/array.hpp" +#include + #include #include "core/base/array_access.hpp" @@ -89,6 +91,11 @@ ValueType reduce_add(const array& input_arr, #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value) GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL); +template GKO_DECLARE_ARRAY_FILL(uint16); +template GKO_DECLARE_ARRAY_FILL(uint32); +#ifndef GKO_SIZE_T_IS_UINT64_T +template GKO_DECLARE_ARRAY_FILL(uint64); +#endif #define GKO_DECLARE_ARRAY_REDUCE_ADD(_type) \ diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp index 5fb08180a31..d2f305f6df0 100644 --- a/core/base/batch_instantiation.hpp +++ b/core/base/batch_instantiation.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/base/intrinsics.hpp b/core/base/intrinsics.hpp new file mode 100644 index 00000000000..1ab50d1e027 --- /dev/null +++ b/core/base/intrinsics.hpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_BASE_INTRINSICS_HPP_ +#define GKO_CORE_BASE_INTRINSICS_HPP_ + + +#include + +// MSVC needs different intrinsics +#ifdef _MSC_VER +#include + +#pragma intrinsic(_BitScanForward, _BitScanForward64, _BitScanReverse, \ + _BitScanReverse64) +#endif + + +namespace gko { +namespace detail { + + +/** + * Returns the index of the highest bit set in this bitmask. + * The least significant bit has index 0. + */ +GKO_ATTRIBUTES GKO_INLINE int find_highest_bit(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return 31 - __clz(static_cast(bitmask)); +#elif defined(_MSC_VER) + unsigned long index{}; + _BitScanReverse(&index, bitmask); + return index; +#else + return 31 - __builtin_clz(bitmask); +#endif +} + + +/** @copydoc find_highest_bit(uint32) */ +GKO_ATTRIBUTES GKO_INLINE int find_highest_bit(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return 63 - __clzll(static_cast(bitmask)); +#elif defined(_MSC_VER) + unsigned long index{}; + _BitScanReverse64(&index, bitmask); + return index; +#else + return 63 - __builtin_clzll(bitmask); +#endif +} + + +/** + * Returns the index of the lowest bit set in this bitmask. + * The least significant bit has index 0. + */ +GKO_ATTRIBUTES GKO_INLINE int find_lowest_bit(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __ffs(static_cast(bitmask)) - 1; +#elif defined(_MSC_VER) + unsigned long index{}; + _BitScanForward(&index, bitmask); + return index; +#else + return __builtin_ffs(bitmask) - 1; +#endif +} + + +/** @copydoc find_lowest_bit(uint32) */ +GKO_ATTRIBUTES GKO_INLINE int find_lowest_bit(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(bitmask)) - 1; +#elif defined(_MSC_VER) + unsigned long index{}; + _BitScanForward64(&index, bitmask); + return index; +#else + return __builtin_ffsll(bitmask) - 1; +#endif +} + + +} // namespace detail +} // namespace gko + +#endif // GKO_CORE_BASE_INTRINSICS_HPP_ diff --git a/core/components/bit_packed_storage.hpp b/core/components/bit_packed_storage.hpp new file mode 100644 index 00000000000..78777c1d2d2 --- /dev/null +++ b/core/components/bit_packed_storage.hpp @@ -0,0 +1,302 @@ +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_COMPONENTS_BIT_PACKED_STORAGE_HPP_ +#define GKO_CORE_COMPONENTS_BIT_PACKED_STORAGE_HPP_ + +#include +#include +#include + +#include + +#include "core/base/intrinsics.hpp" + +namespace gko { + + +/** + * Computes the rounded-up binary logarithm of a number in constexpr fashion. + * In performance-critical runtime contexts, use ceil_log2 instead. + */ +template +constexpr int ceil_log2_constexpr(T value) +{ + if (value == 1) { + return 0; + } + return 1 + ceil_log2_constexpr((value + 1) / 2); +} + + +/** + * Computes the next larger (or equal) power of two for a number in constexpr + * fashion. + */ +template +constexpr int round_up_pow2_constexpr(T value) +{ + return T{1} << ceil_log2_constexpr(value); +} + + +/** + * Computes the rounded-up binary logarithm of a number. + */ +template +constexpr int ceil_log2(T value) +{ + assert(value >= 1); + return value > 1 ? detail::find_highest_bit( + static_cast>(value - 1)) + + 1 + : 0; +} + + +/** + * Computes the rounded-down binary logarithm of a number. + */ +template +constexpr int floor_log2(T value) +{ + assert(value >= 1); + return detail::find_highest_bit( + static_cast>(value)); +} + + +/** + * Computes the next larger (or equal) power of two for a number. + */ +template +constexpr int round_up_pow2(T value) +{ + return T{1} << ceil_log2(value); +} + + +/** + * A compact representation of an array of unsigned integers that can be + * represented using num_bits bits, e.g. values in [0, 2^num_bits). Each integer + * gets stored using round_up_pow2(num_bits) bits inside a StorageType word. + * This storage layout allows for fast division-less access to values, which a + * non-power-of-two number of bits would make hard otherwise. + * The cass is a non-owning view. + * + * @tparam ValueType the type used to represent values in the span + * @tparam IndexType the type used to represent indices in the span + * @tparam StorageType the type used to internally represent the values. It + * needs to be large enough to represent all values to be + * stored. + */ +template +class bit_packed_span { + static_assert(std::is_unsigned_v); + +public: + using value_type = ValueType; + using index_type = IndexType; + using storage_type = StorageType; + /** How many bits are available in StorageType */ + constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT; + /** Binary logarithm of bits_per_word */ + constexpr static int bits_per_word_log2 = + ceil_log2_constexpr(bits_per_word); + + /** + * Returns the binary logarithm of the number of bits that should be used + * internally to store num_bits bits. This is used to allow faster accesses + * without the need for arbitrary integer division, since a division by + * bits_per_value can be represented using a shift by bits_per_value_log2. + */ + constexpr static int bits_per_value_log2(int num_bits) + { + return ceil_log2(num_bits); + } + + /** + * Returns the binary logarithm of the number of values can be stored inside + * a single storage_type word. This gets used to avoid integer divisions in + * favor of faster bit shifts. + */ + constexpr static int values_per_word_log2(int num_bits) + { + return bits_per_word_log2 - bits_per_value_log2(num_bits); + } + + /** + * Computes how many storage_type words will be necessary to store size + * values requiring num_bits bits. + * + * @param size The number of values to store + * @param num_bits The number of bits necessary to store values inside this + * span. This means that all values need to be in the range + * [0, 2^num_bits). + */ + constexpr static index_type storage_size(index_type size, int num_bits) + { + const auto shift = values_per_word_log2(num_bits); + const auto div = storage_type{1} << shift; + return (size + div - 1) >> shift; + } + + /** + * Sets a value inside the span, assuming that value was zero before. + * This can be used if the underlying storage was zero-initialized. + * + * @param i The index to write to + * @param value The value to write. It needs to be in [0, 2^num_bits). + */ + constexpr void set_from_zero(index_type i, value_type value) + { + assert(value >= 0); + assert(value <= static_cast(mask_)); + const auto [block, shift] = get_block_and_shift(i); + data_[block] |= static_cast(value) << shift; + } + + /** + * Clears a value inside the span, setting it to zero. + * + * @param i The index to clear + */ + constexpr void clear(index_type i) + { + const auto [block, shift] = get_block_and_shift(i); + data_[block] &= ~(mask_ << shift); + } + + /** + * Sets a value inside the span. + * + * @param i The index to write to + * @param value The value to write. It needs to be in [0, 2^num_bits). + */ + constexpr void set(index_type i, value_type value) + { + clear(i); + set_from_zero(i, value); + } + + /** + * Returns a value from the span. + * + * @param i The index to read from + * @return the read value + */ + constexpr value_type get(index_type i) const + { + const auto [block, shift] = get_block_and_shift(i); + return static_cast((data_[block] >> shift) & mask_); + } + + /** + * Construct a span from the underlying data. + * + * @param data the data array of size `storage_size(size, num_bits)`. + * @param num_bits the number of bits necessary to store a single value. + * @param size the number of elements in the span. + * + */ + explicit constexpr bit_packed_span(storage_type* data, int num_bits, + index_type size) + : data_{data}, + size_{size}, + mask_{(storage_type{1} << num_bits) - 1}, + bits_per_value_{round_up_pow2(num_bits)}, + values_per_word_log2_{values_per_word_log2(num_bits)}, + local_index_mask_{(1 << values_per_word_log2_) - 1} + { + // ignore the sign bit in this comparison + assert(num_bits < bits_per_word); + } + +private: + constexpr std::pair get_block_and_shift(index_type i) const + { + assert(i >= 0); + assert(i < size_); + return std::make_pair(i >> values_per_word_log2_, + (i & local_index_mask_) * bits_per_value_); + } + + storage_type* data_; + index_type size_; + storage_type mask_; + int bits_per_value_; + int values_per_word_log2_; + int local_index_mask_; +}; + + +/** + * An array of size unsigned integers stored in a compact fashion with num_bits + * bits each. This is a statically-sized, owning equivalent of + * bit_packed_span. + + * @tparam num_bits The number of bits necessary to store a single value in the + * array. Values need to be in the range [0, 2^num_bits). + * @tparam size The number of values to store in the array. + */ +template +class bit_packed_array { +public: + using storage_type = uint32; + constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT; + constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits); + constexpr static int values_per_word = bits_per_word / bits_per_value; + constexpr static int num_words = + (size + values_per_word - 1) / values_per_word; + // we need to shift by less than 32 to avoid UB + constexpr static storage_type mask = + (storage_type{2} << (bits_per_value - 1)) - storage_type{1}; + // ignore the sign bit in this comparison + static_assert(num_bits < bits_per_word); + static_assert(num_bits > 0); + static_assert(size >= 0); + + /** Zero-initializes all values in the array. */ + constexpr bit_packed_array() : data_{} {} + + /** @copydoc bit_packed_span::set_from_zero */ + constexpr void set_from_zero(int i, int value) + { + assert(value >= 0); + assert(value <= mask); + data_[i / values_per_word] |= + value << ((i % values_per_word) * bits_per_value); + } + + /** @copydoc bit_packed_span::clear */ + constexpr void clear(int i) + { + data_[i / values_per_word] &= + ~(mask << ((i % values_per_word) * bits_per_value)); + } + + /** @copydoc bit_packed_span::set */ + constexpr void set(int i, int value) + { + clear(i); + set_from_zero(i, value); + } + + /** @copydoc bit_packed_span::get */ + constexpr int get(int i) const + { + return (data_[i / values_per_word] >> + (i % values_per_word) * bits_per_value) & + mask; + } + +private: + storage_type data_[num_words]; +}; + + +} // namespace gko + + +#endif // GKO_CORE_COMPONENTS_BIT_PACKED_STORAGE_HPP_ diff --git a/core/components/range_minimum_query.cpp b/core/components/range_minimum_query.cpp new file mode 100644 index 00000000000..07cb69fcdff --- /dev/null +++ b/core/components/range_minimum_query.cpp @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query.hpp" + +#include + +#include "core/components/range_minimum_query_kernels.hpp" + + +namespace gko { +namespace { + + +GKO_REGISTER_OPERATION(compute_lookup_inside_blocks, + range_minimum_query::compute_lookup_inside_blocks); +GKO_REGISTER_OPERATION(compute_lookup_across_blocks, + range_minimum_query::compute_lookup_across_blocks); + + +} // namespace + + +template +range_minimum_query::range_minimum_query(array data) + : num_blocks_{static_cast( + ceildiv(static_cast(data.get_size()), block_size))}, + lut_{data.get_executor()}, + block_tree_indices_{data.get_executor(), + static_cast(num_blocks_)}, + block_argmin_storage_{ + data.get_executor(), + static_cast(block_argmin_view_type::storage_size( + static_cast(num_blocks_), block_argmin_num_bits))}, + block_min_{data.get_executor(), static_cast(num_blocks_)}, + superblock_storage_{data.get_executor(), + static_cast( + superblock_view_type::storage_size(num_blocks_))}, + values_{std::move(data)} +{ + const auto exec = values_.get_executor(); + auto block_argmin = block_argmin_view_type{ + block_argmin_storage_.get_data(), block_argmin_num_bits, num_blocks_}; + exec->run(make_compute_lookup_inside_blocks( + values_.get_const_data(), static_cast(values_.get_size()), + block_argmin, block_min_.get_data(), block_tree_indices_.get_data())); + auto superblocks = + superblock_view_type{block_min_.get_const_data(), + superblock_storage_.get_data(), num_blocks_}; + exec->run(make_compute_lookup_across_blocks(block_min_.get_const_data(), + num_blocks_, superblocks)); +} + + +template +typename range_minimum_query::view_type +range_minimum_query::get() const +{ + return device_range_minimum_query{ + values_.get_const_data(), + block_min_.get_const_data(), + block_argmin_storage_.get_const_data(), + block_tree_indices_.get_const_data(), + superblock_storage_.get_const_data(), + lut_.get(), + static_cast(values_.get_size())}; +} + + +#define GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY(IndexType) \ + class range_minimum_query + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY); + + +} // namespace gko diff --git a/core/components/range_minimum_query.hpp b/core/components/range_minimum_query.hpp new file mode 100644 index 00000000000..eccbeecbc58 --- /dev/null +++ b/core/components/range_minimum_query.hpp @@ -0,0 +1,707 @@ +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_COMPONENTS_RANGE_MINIMUM_QUERY_HPP_ +#define GKO_CORE_COMPONENTS_RANGE_MINIMUM_QUERY_HPP_ + +#include +#include +#include + +#include +#include +#include + +#include "core/base/index_range.hpp" +#include "core/base/intrinsics.hpp" +#include "core/components/bit_packed_storage.hpp" + +namespace gko { +namespace detail { + + +/** + * Helper structure that contains information about the set of Cartesian trees + * on num_nodes nodes. + */ +template +struct cartesian_tree { + /** + * A pre-computed lookup table for the recursively defined Ballot numbers. + * Donald E. Knuth, "Generating All Trees ā€” History of Combinatorial + * Generation" In: The Art of Computer Programming. Vol. 4, Fasc. 4 + * Definition: + * C(0,0) = 1, + * C(p,q) = C(p,qāˆ’1) + C(pāˆ’1,q) if 0 <= p <= q != 0 + * C(p,q) = 0 elsewhere + */ + struct ballot_number_lookup { + constexpr static int size = num_nodes + 1; + constexpr static int size2 = size * size; + + /** Builds the lookup table */ + constexpr ballot_number_lookup() : lut{} + { + for (int p = 0; p < size; p++) { + for (int q = 0; q < size; q++) { + int value{}; + if (p == 0 && q == 0) { + value = 1; + } else if (p <= q && q > 0) { + value = lut[p * size + (q - 1)]; + if (p > 0) { + value += lut[(p - 1) * size + q]; + } + } + lut[p * size + q] = value; + } + } + } + + /** Returns the ballot number C(p, q). */ + constexpr int operator()(int p, int q) const + { + if (p < 0 || q < 0) { + return 0; + } + assert(p <= num_nodes && q <= num_nodes); + return lut[p * size + q]; + } + + /** Returns the Catalan number C_s = C(s, s). */ + constexpr int operator()(int s) const { return operator()(s, s); } + + int lut[size2]; + }; + + constexpr static ballot_number_lookup ballot_number{}; + + /** + * The number of Cartesian trees on num_nodes nodes. It is given by the + * num_nodes'th Catalan number, which grows asymptotically like 4^num_nodes. + */ + constexpr static int num_trees = ballot_number(num_nodes); + + /** + * Computes the index of the Cartesian tree for the given values. + * + * @param values The values to build the Cartesian tree for. Only the first + * cur_num_nodes values are considered. + * @param cur_num_nodes How many values from values to consider as input. + * @return the tree index in range [0, C_(cur_num_nodes)) where C_s is the + * s'th Catalan number. + */ + constexpr static int compute_tree_index( + const int values[num_nodes], + const ballot_number_lookup& ballot_number_lut, + int cur_num_nodes = num_nodes) + { + // build cartesian tree left-to-right and traverse ballot number + // triangle simultaneously + // This is Algorithm 1 from J. Fischer and V. Heun, "Space-Efficient + // Preprocessing Schemes for Range Minimum Queries on Static Arrays," + // doi: 10.1137/090779759. + int rightmost[num_nodes + 1]{}; + rightmost[0] = std::numeric_limits::lowest(); + int number = 0; + int q = cur_num_nodes; + for (int i = 0; i < cur_num_nodes; i++) { + while (rightmost[q + i - cur_num_nodes] > values[i]) { + number += ballot_number_lut(cur_num_nodes - (i + 1), q); + q--; + } + rightmost[q + i + 1 - cur_num_nodes] = values[i]; + } + return number; + } + + /** + * For each possible Cartesian tree on num_nodes nodes, this builds an + * array of values that has that Cartesian tree. + * This means that compute_tree_index(representatives[i]) == i. + */ + constexpr static std::array + compute_tree_representatives() + { + // all_representatives[i] contains the representatives for all Cartesian + // trees on i nodes, the trailing entries of this array are + // zero-initialized. + // all_representatives[i][j] contains the representative for the + // Cartesian tree with i nodes and index j. + std::array, num_nodes + 1> + all_representatives{}; + + // Recursively combine representatives for smaller inputs to larger + // representatives. + for (int cur_num_nodes = 1; cur_num_nodes <= num_nodes; + cur_num_nodes++) { + // The root node of a Cartesian tree is its minimum, so we can + // enumerate all possible Cartesian trees by choosing all possible + // minimum positions, and the left and right subtrees/left and right + // halves around the minimum can be chosen independently. + // This enumeration does not list representatives in order of their + // tree index, so we need to use compute_tree_index internally. + for (int min_pos = 0; min_pos < cur_num_nodes; min_pos++) { + const auto left_size = min_pos; + const auto right_size = cur_num_nodes - min_pos - 1; + const auto left_count = ballot_number(left_size); + const auto right_count = ballot_number(right_size); + // We go through all possible pairs of representatives for the + // left and right subtree + for (int left_idx = 0; left_idx < left_count; left_idx++) { + const auto& left_rep = + all_representatives[left_size][left_idx]; + for (int right_idx = 0; right_idx < right_count; + right_idx++) { + const auto& right_rep = + all_representatives[right_size][right_idx]; + int local_rep[num_nodes]{}; + // The minimum is the smallest with value 0 + local_rep[min_pos] = 0; + // The left subtree gets increased by 1 so its minimum + // is larger than the overall minimum, and copied to the + // subrange left of the minimum + for (int i = 0; i < left_size; i++) { + local_rep[i] = left_rep[i] + 1; + } + // The right subtree gets increased and copied to the + // right of the minimum + for (int i = 0; i < right_size; i++) { + local_rep[i + min_pos + 1] = right_rep[i] + 1; + } + // The we can figure out what the tree index of this + // representative is... + const auto tree_number = compute_tree_index( + local_rep, ballot_number, cur_num_nodes); + auto& output_rep = + all_representatives[cur_num_nodes][tree_number]; + // ... and copy over its values to the right location + for (int i = 0; i < cur_num_nodes; i++) { + output_rep[i] = local_rep[i]; + } + } + } + } + } + return all_representatives[num_nodes]; + } +}; + + +} // namespace detail + + +template +class device_block_range_minimum_query_lookup_table { +public: + using tree = detail::cartesian_tree; + // how many trees does the lookup table (LUT) contain? + constexpr static int num_trees = tree::num_trees; + // how many bits do we need theoretically for this block? + constexpr static int num_bits = ceil_log2_constexpr(block_size); + + constexpr device_block_range_minimum_query_lookup_table() : lookup_table{} + { + const auto& representatives = tree::compute_tree_representatives(); + for (int tree = 0; tree < num_trees; tree++) { + const auto& rep = representatives[tree]; + for (int first = 0; first < block_size; first++) { + for (int last = first; last < block_size; last++) { + int min_index = first; + for (int i = first + 1; i <= last; i++) { + if (rep[i] < rep[min_index]) { + min_index = i; + } + } + lookup_table[tree].set(first + block_size * last, + min_index); + } + } + } + } + + /** Computes the tree index of the Cartesian tree for the given values. */ + template + constexpr int compute_tree_index(const T values[block_size]) const + { + // build cartesian tree left-to-right and traverse ballot number + // triangle in parallel + T rightmost[block_size + 1]{}; + rightmost[0] = std::numeric_limits::lowest(); + int number = 0; + int q = block_size; + for (int i = 0; i < block_size; i++) { + while (rightmost[q + i - block_size] > values[i]) { + number += ballot_number(block_size - (i + 1), q); + q--; + } + rightmost[q + i + 1 - block_size] = values[i]; + } + return number; + } + + /** + * Returns the range minimum for an array with the given Cartesian tree + * index in the range [first, last]. + * + * @param tree the tree index for the Cartesian tree. + * @param first the first index in the range. + * @param last the last index in the range. + * @return the range minimum, i.e. $\argmin_{i \in [first, last]}(values)$ + * where `compute_tree_index(values) == tree`. + */ + constexpr int lookup(int tree, int first, int last) const + { + return lookup_table[tree].get(first + block_size * last); + } + +private: + typename tree::ballot_number_lookup ballot_number; + bit_packed_array lookup_table[num_trees]; +}; + + +/** + * Represents a small block RMQ lookup table in device memory. + * It will be initialized on the host side and copied to the device. + * + * @tparam block_size the small block size to build the lookup table for. + */ +template +class block_range_minimum_query_lookup_table { +public: + using view_type = device_block_range_minimum_query_lookup_table; + + /** Initializes the lookup table in device memory for the given executor. */ + block_range_minimum_query_lookup_table(std::shared_ptr exec) + : data_{exec, sizeof(view_type)} + { + view_type lut{}; + exec->copy_from(exec->get_master(), 1, &lut, get()); + } + + /** Returns a pointer to the lookup table. */ + const view_type* get() const + { + return reinterpret_cast(data_.get_const_data()); + } + +private: + view_type* get() { return reinterpret_cast(data_.get_data()); } + + array data_; +}; + + +template +struct range_minimum_query_result { + IndexType argmin; + IndexType min; +}; + + +/** + * Provides a constant-time Range Minimum Query structure using n * log_2 n + * values of storage. + * Internally it stores the range minimum for every block of power-of-two size + * that fits inside the full range of indices. + * Any RMQ can then be answered by taking the minimum between the leftmost and + * rightmost block of the largest possible power-of-two size that fits inside + * the queried range. These power-of-two blocks cover every possible range. + * The query answers are stored in levels, with each level representing a block + * size, starting with block size 2 at level 0, block size 4 at level 1 etc. + * Level $i$ uses block size $2^(i + 1)$, so for block size $b$, we use level + * $\log_2(b) - 1$. For each level, we use a bit_packed_span to store the + * answers in a compact way, since the argmin inside a block ranges only up to + * that block's size. To simplify indexing, every block stores the range minimum + * query for every possible block start index in [0, size), even if that block + * exceeds the range of values. We implicitly pad the end of the value array + * with infinity values, which are never the minimum. + */ +template +class device_range_minimum_query_superblocks { +public: + using index_type = IndexType; + using storage_type = std::make_unsigned_t; + using query_result = range_minimum_query_result; + constexpr static auto index_type_bits = CHAR_BIT * sizeof(index_type); + + /** + * Initializes the superblock RMQ data structure based on the value array + * and argmin storage. + * + * @param values the value array + * @param storage the argmin storage array of size + * `storage_size(size)` + * @param size the number of values in the value array + */ + explicit constexpr device_range_minimum_query_superblocks( + const index_type* values, storage_type* storage, index_type size) + : values_{values}, storage_{storage}, size_{size} + {} + + /** Returns the value at index i. */ + constexpr index_type value(index_type i) const + { + assert(i >= 0); + assert(i < size()); + return values_[i]; + } + + /** + * Returns the level index which contains the superblocks covering the range + * minimum at the distance `last - first` between the query points. + * @param distance the distance `last - first` + */ + constexpr static int level_for_distance(index_type distance) + { + assert(distance >= 0); + return distance >= 2 ? floor_log2(distance) - 1 : 0; + } + + /** Returns the superblock size at the given level index. */ + constexpr static index_type block_size_for_level(int level) + { + assert(level >= 0); + return index_type{1} << (level + 1); + } + + /** + * Returns the range minimum in the range `[first, last]`. + * + * @param first the first index + * @param last the second index + * @return the min and argmin of the values in the range `[first, last]`. + */ + constexpr query_result query(index_type first, index_type last) const + { + assert(first >= 0); + assert(first <= last); + assert(last < size()); + const auto len = last - first; + if (len == 0) { + return query_result{first, value(first)}; + } + const auto level = level_for_distance(len); + const auto argmin1 = first + block_argmin(level, first); + const auto mid = last - block_size_for_level(level) + 1; + const auto argmin2 = mid + block_argmin(level, mid); + const auto min1 = value(argmin1); + const auto min2 = value(argmin2); + // we need <= here so the tie always breaks to the smaller argmin + return min1 <= min2 ? query_result{argmin1, min1} + : query_result{argmin2, min2}; + } + + /** + * Returns the argmin inside a block of power-of-two size. + * + * @param level the level index, which is the binary logarithm + * of the block size minus 1. + * @param index the start index of the block + * @return argmin of the range `[index, index + 2^(level + 1) - 1]`. + */ + constexpr index_type block_argmin(int level, size_type index) const + { + return get_level(level).get(index); + } + + /** + * Sets the argmin for a block of power-of-two size. + * + * @param level the level index, which is the binary logarithm + * of the block size minus 1. + * @param index the start index of the block + * @param value argmin of the range + * `[index, index + 2^(block_size_log2_m1 + 1) - 1]`. + */ + constexpr void set_block_argmin(int level, size_type index, + index_type value) + { + get_level(level).set(index, value); + } + + /** + * Computes the size-independent lookup array used to compute the storage + * offsets into the storage array. + * blocks of size 2 use 1 bit to store the argmin + * blocks of size 4 use 2 bits to store the argmin + * blocks of size 8 and 16 use 4 bits to store the argmin + * blocks of size 32 - 256 use 8 bits to store the argmin + * blocks of size 512 - 16384 use 16 bits to store the argmin + * ... + * Every level uses get_num_blocks() * number of bits values to store its + * argmin values. + */ + constexpr static std::array + compute_block_offset_lookup() + { + std::array result{}; + for (int i = 1; i <= index_type_bits; i++) { + const auto storage_multiplier = round_up_pow2_constexpr(i); + result[i] = result[i - 1] + storage_multiplier; + } + return result; + } + + /** + * Returns the storage offset where the given level begins. + * + * @param level the level index, which is the binary logarithm + * of the block size minus 1. + */ + constexpr int get_offset(int level) const + { + constexpr auto offsets = compute_block_offset_lookup(); + assert(level >= 0); + assert(level < index_type_bits); + return offsets[level] * get_num_blocks(); + } + + /** Returns the number of values in the value array. */ + constexpr index_type size() const { return size_; } + + /** Returns the number of levels necessary for the value array. */ + constexpr int num_levels() const { return num_levels(size()); } + + /** Returns the number of levels necessary for the given input size. */ + constexpr static int num_levels(index_type size) + { + return size > 1 ? (size > 2 ? ceil_log2(size) - 1 : 1) : 0; + } + + /** + * Returns the number of values of storage_type type to store the + * superblock argmins for the given number of values. + * + * @param size the size of the value array + * @return the number of storage_type values to store the superblocks. + */ + constexpr static index_type storage_size(index_type size) + { + return compute_block_offset_lookup()[num_levels(size)] * + get_num_blocks(size); + } + +private: + constexpr index_type get_num_blocks() const + { + return get_num_blocks(size_); + } + + constexpr static index_type get_num_blocks(index_type size) + { + return (size + index_type_bits - 1) / index_type_bits; + } + + constexpr bit_packed_span + get_level(int block_size_log2_m1) const + { + const auto values = storage_ + get_offset(block_size_log2_m1); + const int num_bits = round_up_pow2(block_size_log2_m1 + 1); + return bit_packed_span{ + values, num_bits, size_}; + } + + constexpr bit_packed_span get_level( + int block_size_log2_m1) + { + const auto values = storage_ + get_offset(block_size_log2_m1); + const int num_bits = round_up_pow2(block_size_log2_m1 + 1); + return bit_packed_span{ + values, num_bits, size_}; + } + + const index_type* values_; + storage_type* storage_; + index_type size_; +}; + + +/** + * Range Minimum Query data structure based on Cartesian tree-based lookups in + * small blocks and power-of-two superblock-based lookups for large blocks. + * This is a non-owning view. + * + * J. Fischer and V. Heun, "Space-Efficient Preprocessing Schemes for Range + * Minimum Queries on Static Arrays," doi: 10.1137/090779759. + */ +template +class device_range_minimum_query { +public: + using index_type = IndexType; + using block_lookup_type = + device_block_range_minimum_query_lookup_table; + using superblock_lookup_type = + device_range_minimum_query_superblocks; + using storage_type = typename superblock_lookup_type::storage_type; + using query_result = range_minimum_query_result; + + /** + * Constructs the RMQ data structure from its data arrays. + * + * @param values the value array + * @param block_min the block minimum array + * @param block_argmin the packed block argmin array + * @param block_tree_index the block tree index array + * @param superblock_storage the packed superblock argmin array + * @param block_lut the lookup table for RMQs inside small Cartesian trees + * @param size the number of elements in the value array + */ + explicit constexpr device_range_minimum_query( + const index_type* values, const index_type* block_min, + const uint32* block_argmin, const uint16* block_tree_index, + const storage_type* superblock_storage, + const block_lookup_type* block_lut, index_type size) + : num_blocks_{static_cast(ceildiv(size, block_size))}, + values_{values}, + block_tree_indices_{block_tree_index}, + block_argmin_{block_argmin, ceil_log2_constexpr(block_size), + num_blocks_}, + superblocks_{block_min, superblock_storage, num_blocks_}, + block_lut_{block_lut}, + size_{size} + {} + + /** + * Returns the range minimum in the range `[first, last]`. + * + * @param first the first index + * @param last the second index + * @return the min and argmin of the values in the range `[first, last]`. + */ + constexpr query_result query(index_type first, index_type last) const + { + assert(first >= 0); + assert(first <= last); + assert(last < size()); + // shortcut for trivial queries + if (first == last) { + return query_result{first, values_[first]}; + } + const auto first_block = first / block_size; + const auto last_block = last / block_size; + const auto first_block_base = first_block * block_size; + const auto first_local = first - first_block_base; + const auto last_block_base = last_block * block_size; + const auto last_local = last - last_block_base; + const auto first_block_tree_index = block_tree_indices_[first_block]; + const auto last_block_tree_index = block_tree_indices_[last_block]; + // both values in the same block + if (first_block == last_block) { + const auto argmin = + first_block_base + block_lut_->lookup(first_block_tree_index, + first_local, last_local); + return query_result{argmin, values_[argmin]}; + } + // both values in adjacent blocks + if (last_block == first_block + 1) { + // from first to the end of the block + const auto first_argmin = + first_block_base + block_lut_->lookup(first_block_tree_index, + first_local, + block_size - 1); + // from beginning of the block to last + const auto last_argmin = + last_block_base + + block_lut_->lookup(last_block_tree_index, 0, last_local); + const auto first_min = values_[first_argmin]; + const auto last_min = values_[last_argmin]; + return first_min <= last_min ? query_result{first_argmin, first_min} + : query_result{last_argmin, last_min}; + } + // general case: both values in different non-adjacent blocks + const auto first_full_block = + first_local == 0 ? first_block : first_block + 1; + const auto last_full_block = + last_local == block_size - 1 ? last_block : last_block - 1; + const auto full_block_result = + superblocks_.query(first_full_block, last_full_block); + const auto first_block_argmin = + first_block_base + block_lut_->lookup(first_block_tree_index, + first_local, block_size - 1); + const auto last_block_argmin = + last_block_base + + block_lut_->lookup(last_block_tree_index, 0, last_local); + const auto first_block_min = values_[first_block_argmin]; + const auto last_block_min = values_[last_block_argmin]; + auto result_min = + min(full_block_result.min, min(first_block_min, last_block_min)); + const auto get_full_block_result_argmin = [&] { + return (full_block_result.argmin * block_size + + block_argmin_.get(full_block_result.argmin)); + }; + auto result_argmin = first_block_min == result_min ? first_block_argmin + : full_block_result.min == result_min + ? get_full_block_result_argmin() + : last_block_argmin; + return query_result{result_argmin, result_min}; + } + + /** Returns the number of values in the value array. */ + constexpr index_type size() const { return size_; } + +private: + index_type num_blocks_; + const index_type* values_; + const uint16* block_tree_indices_; + bit_packed_span block_argmin_; + superblock_lookup_type superblocks_; + const block_lookup_type* block_lut_; + index_type size_; +}; + + +/** + * Owning version of device_range_minimum_query, which creates the necessary + * data from an input array automatically. + * + * @tparam IndexType the type of indices and values in the underlying array. + */ +template +class range_minimum_query { +public: + constexpr static int block_size = 8; + constexpr static int block_argmin_num_bits = + ceil_log2_constexpr(block_size); + using index_type = IndexType; + using view_type = device_range_minimum_query; + using block_lut_type = block_range_minimum_query_lookup_table; + using block_lut_view_type = typename block_lut_type::view_type; + using block_argmin_view_type = bit_packed_span; + using block_argmin_storage_type = + typename block_argmin_view_type::storage_type; + using superblock_view_type = + device_range_minimum_query_superblocks; + using superblock_storage_type = typename superblock_view_type::storage_type; + + /** + * Constructs the RMQ data structure from the given array. + * The data structure uses the array's executor for all internal arrays and + * kernel calls. + * + * @param data the value array + */ + range_minimum_query(array data); + + /** + * Returns the device_range_minimum_query view for the data, for use in + * kernels. + */ + view_type get() const; + +private: + index_type num_blocks_; + block_lut_type lut_; + array block_tree_indices_; + array block_argmin_storage_; + array block_min_; + array superblock_storage_; + array values_; +}; + + +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_RANGE_MINIMUM_QUERY_HPP_ diff --git a/core/components/range_minimum_query_kernels.hpp b/core/components/range_minimum_query_kernels.hpp new file mode 100644 index 00000000000..b777dcb2e1b --- /dev/null +++ b/core/components/range_minimum_query_kernels.hpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_COMPONENTS_RANGE_MINIMUM_QUERY_KERNELS_HPP_ +#define GKO_CORE_COMPONENTS_RANGE_MINIMUM_QUERY_KERNELS_HPP_ + + +#include "core/components/range_minimum_query.hpp" + +#include + +#include +#include +#include + +#include "core/base/kernel_declaration.hpp" +#include "core/components/bit_packed_storage.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL(IndexType) \ + void compute_lookup_inside_blocks( \ + std::shared_ptr exec, const IndexType* values, \ + IndexType size, bit_packed_span& block_argmin, \ + IndexType* block_min, uint16* block_tree_index) + + +#define GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL(IndexType) \ + void compute_lookup_across_blocks( \ + std::shared_ptr exec, \ + const IndexType* block_min, IndexType num_blocks, \ + device_range_minimum_query_superblocks& superblocks) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL(IndexType); \ + template \ + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL(IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(range_minimum_query, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_PRECISION_CONVERSION_KERNELS_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 437e1145c40..5ed2daa540c 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + #include #include @@ -15,6 +17,7 @@ #include "core/components/format_conversion_kernels.hpp" #include "core/components/precision_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/components/range_minimum_query_kernels.hpp" #include "core/components/reduce_array_kernels.hpp" #include "core/distributed/assembly_kernels.hpp" #include "core/distributed/index_map_kernels.hpp" @@ -247,6 +250,11 @@ template GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL(size_type); GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); template GKO_DECLARE_FILL_ARRAY_KERNEL(bool); +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint16); +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint32); +#ifndef GKO_SIZE_T_IS_UINT64_T +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint64); +#endif GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); @@ -280,6 +288,18 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES); } // namespace components +namespace range_minimum_query { + + +GKO_STUB_INDEX_TYPE( + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL); +GKO_STUB_INDEX_TYPE( + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL); + + +} // namespace range_minimum_query + + namespace idx_set { diff --git a/core/test/components/CMakeLists.txt b/core/test/components/CMakeLists.txt index e88b373d246..77cf1679a21 100644 --- a/core/test/components/CMakeLists.txt +++ b/core/test/components/CMakeLists.txt @@ -1,2 +1,4 @@ ginkgo_create_test(addressable_pq) +ginkgo_create_test(bit_packed_storage) ginkgo_create_test(disjoint_sets) +ginkgo_create_test(range_minimum_query) diff --git a/core/test/components/bit_packed_storage.cpp b/core/test/components/bit_packed_storage.cpp new file mode 100644 index 00000000000..ca6ba4113af --- /dev/null +++ b/core/test/components/bit_packed_storage.cpp @@ -0,0 +1,139 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bit_packed_storage.hpp" + +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/test/utils.hpp" +#include "gtest/gtest.h" + + +template +class BitPackedSpan : public ::testing::Test { +public: + using index_type = std::tuple_element_t<0, IndexWordType>; + using storage_type = std::tuple_element_t<1, IndexWordType>; + using bit_packed_span = + gko::bit_packed_span; + + std::default_random_engine rng{2457}; +}; + +using WordTypes = ::testing::Types, + std::pair>; + +TYPED_TEST_SUITE(BitPackedSpan, WordTypes, PairTypenameNameGenerator); + + +TYPED_TEST(BitPackedSpan, Works) +{ + using bit_packed_span = typename TestFixture::bit_packed_span; + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + for (const auto size : {0, 10, 100, 1000, 1023, 1024, 1025}) { + SCOPED_TRACE(size); + for (const auto num_bits : {2, 3, 5, 7, 8, 9, 31}) { + SCOPED_TRACE(num_bits); + const auto max_value = index_type{1} << num_bits; + std::vector packed_data( + bit_packed_span::storage_size(size, num_bits)); + std::vector packed_data2( + bit_packed_span::storage_size(size, num_bits)); + std::vector data(size); + std::vector retrieved_data(size); + std::vector retrieved_data2(size); + std::uniform_int_distribution dist{0, max_value - 1}; + for (auto& val : data) { + val = dist(this->rng); + } + // scramble packed_data2 to check proper initialization + std::uniform_int_distribution dist2{storage_type{}, + ~storage_type{}}; + for (auto& val : packed_data2) { + val = dist2(this->rng); + } + + bit_packed_span span{packed_data.data(), num_bits, size}; + bit_packed_span span2{packed_data2.data(), num_bits, size}; + for (const auto i : gko::irange{size}) { + span.set_from_zero(i, data[i]); + span2.set(i, data[i]); + } + + for (const auto i : gko::irange{size}) { + retrieved_data[i] = span.get(i); + retrieved_data2[i] = span2.get(i); + } + ASSERT_EQ(data, retrieved_data); + ASSERT_EQ(data, retrieved_data2); + } + } +} + + +template +struct size_info { + constexpr static int num_bits = num_bits_; + constexpr static int size = size_; +}; + + +template +class BitPackedArray : public ::testing::Test { +public: + constexpr static int num_bits = SizeInfoType::num_bits; + constexpr static int size = SizeInfoType::size; + using bit_packed_array = gko::bit_packed_array; + + std::default_random_engine rng{6735}; +}; + +using Sizes = + ::testing::Types, // single word, partially filled + size_info<1, 1024>, // multiple words + size_info<3, 8>, // single word, non power-of-two number + // of bits, fully filled + size_info<3, 9>, // multiple words, partially filled + size_info<5, 52>, // larger size + size_info<31, 3> // single word for each value + >; + +TYPED_TEST_SUITE(BitPackedArray, Sizes, TypenameNameGenerator); + + +TYPED_TEST(BitPackedArray, Works) +{ + constexpr auto num_bits = TestFixture::num_bits; + constexpr auto size = TestFixture::size; + using bit_packed_array = typename TestFixture::bit_packed_array; + using storage_type = typename bit_packed_array::storage_type; + const auto max_value = storage_type{2} << (num_bits - 1); + std::array data{}; + std::array retrieved_data{}; + std::array retrieved_data2{}; + std::uniform_int_distribution dist{ + 0, static_cast(max_value - 1)}; + for (auto& val : data) { + val = dist(this->rng); + } + + bit_packed_array array; + bit_packed_array array2; + for (const auto i : gko::irange{size}) { + array.set_from_zero(i, data[i]); + array2.set(i, dist(this->rng)); + array2.set(i, data[i]); + } + + for (const auto i : gko::irange{size}) { + retrieved_data[i] = array.get(i); + retrieved_data2[i] = array.get(i); + } + ASSERT_EQ(data, retrieved_data); + ASSERT_EQ(data, retrieved_data2); +} diff --git a/core/test/components/range_minimum_query.cpp b/core/test/components/range_minimum_query.cpp new file mode 100644 index 00000000000..02f4b30eaf5 --- /dev/null +++ b/core/test/components/range_minimum_query.cpp @@ -0,0 +1,170 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query.hpp" + +#include + +#include + +#include "core/test/utils.hpp" + + +TEST(RangeMinimumQuery, RepresentativesAreExhaustive) +{ + constexpr auto size = 8; + using tree = gko::detail::cartesian_tree; + int values[size]{}; + std::iota(values, values + size, 0); + const auto reps = tree::compute_tree_representatives(); + do { + const auto tree_number = + tree::compute_tree_index(values, tree::ballot_number); + const auto rep_tree_number = + tree::compute_tree_index(reps[tree_number], tree::ballot_number); + + ASSERT_EQ(tree_number, rep_tree_number); + } while (std::next_permutation(values, values + size)); +} + + +TEST(RangeMinimumQuery, LookupRepresentatives) +{ + constexpr auto size = 8; + using tree = gko::detail::cartesian_tree; + gko::device_block_range_minimum_query_lookup_table table; + const auto reps = tree::compute_tree_representatives(); + for (const auto& rep : reps) { + const auto tree = tree::compute_tree_index(rep, tree::ballot_number); + for (const auto first : gko::irange{size}) { + for (const auto last : gko::irange{size}) { + const auto begin = rep + first; + const auto end = rep + last + 1; + const auto min_pos = + first > last ? 0 : std::min_element(begin, end) - rep; + + ASSERT_EQ(table.lookup(tree, first, last), min_pos); + } + } + } +} + + +TEST(RangeMinimumQuery, LookupExhaustive) +{ + constexpr auto size = 8; + gko::device_block_range_minimum_query_lookup_table table; + int values[size]{}; + std::iota(values, values + size, 0); + do { + const auto tree_number = table.compute_tree_index(values); + for (const auto first : gko::irange{size}) { + for (const auto last : gko::irange{first, size}) { + const auto lookup_val = table.lookup(tree_number, first, last); + const auto actual_val = + std::min_element(values + first, values + last + 1) - + values; + + ASSERT_EQ(lookup_val, actual_val); + } + } + } while (std::next_permutation(values, values + size)); +} + + +TEST(RangeMinimumQuery, OffsetsAreCorrect) +{ + constexpr auto data = gko::device_range_minimum_query_superblocks< + gko::int32>::compute_block_offset_lookup(); + constexpr auto data_long = gko::device_range_minimum_query_superblocks< + gko::int64>::compute_block_offset_lookup(); + ASSERT_EQ(data[0], 0); + ASSERT_EQ(data_long[0], 0); + // blocks of size 2^1 need 1 bit each + ASSERT_EQ(data[1], 1); + ASSERT_EQ(data_long[1], 1); + // blocks of size 2^2 need 2 bits each + ASSERT_EQ(data[2], 3); + ASSERT_EQ(data_long[2], 3); + // blocks of size 2^3 need 4 bits each + ASSERT_EQ(data[3], 7); + ASSERT_EQ(data_long[3], 7); + // blocks of size 2^4 need 4 bits each + ASSERT_EQ(data[4], 11); + ASSERT_EQ(data_long[4], 11); + // blocks of size 2^5 need 8 bits each + ASSERT_EQ(data[5], 19); + ASSERT_EQ(data_long[5], 19); + // blocks of size 2^6 need 8 bits each + ASSERT_EQ(data[6], 27); + ASSERT_EQ(data_long[6], 27); + // blocks of size 2^7 need 8 bits each + ASSERT_EQ(data[7], 35); + ASSERT_EQ(data_long[7], 35); + // blocks of size 2^8 need 8 bits each + ASSERT_EQ(data[8], 43); + ASSERT_EQ(data_long[8], 43); + // blocks of size 2^9 - 2^16 need 16 bits each + ASSERT_EQ(data[9], 59); + ASSERT_EQ(data_long[9], 59); + ASSERT_EQ(data[16], 171); + ASSERT_EQ(data_long[16], 171); + // blocks of size 2^17-2^32 need 32 bits each + ASSERT_EQ(data[31], 651); + ASSERT_EQ(data_long[31], 651); + ASSERT_EQ(data_long[32], 683); + // blocks of size 2^33-2^64 need 64 bits each + ASSERT_EQ(data_long[63], 2667); +} + + +TEST(RangeMinimumQuery, NumLevelsIsCorrect) +{ + const auto test = [](auto value) { + using index_type = decltype(value); + using superblocks = + gko::device_range_minimum_query_superblocks; + ASSERT_EQ(superblocks::num_levels(0), 0); + ASSERT_EQ(superblocks::num_levels(1), 0); + ASSERT_EQ(superblocks::num_levels(2), 1); + ASSERT_EQ(superblocks::num_levels(3), 1); + ASSERT_EQ(superblocks::num_levels(4), 1); + ASSERT_EQ(superblocks::num_levels(5), 2); + ASSERT_EQ(superblocks::num_levels(8), 2); + ASSERT_EQ(superblocks::num_levels(9), 3); + ASSERT_EQ(superblocks::num_levels(16), 3); + ASSERT_EQ(superblocks::num_levels(17), 4); + ASSERT_EQ(superblocks::num_levels(32), 4); + ASSERT_EQ(superblocks::num_levels(33), 5); + ASSERT_EQ( + superblocks::num_levels(std::numeric_limits::max()), + sizeof(index_type) * CHAR_BIT - 2); + }; + test(gko::int32{}); + test(gko::int64{}); +} + + +TEST(RangeMinimumQuery, LevelForDistanceIsCorrect) +{ + const auto test = [](auto value) { + using index_type = decltype(value); + using superblocks = + gko::device_range_minimum_query_superblocks; + ASSERT_EQ(superblocks::level_for_distance(0), 0); + ASSERT_EQ(superblocks::level_for_distance(1), 0); + ASSERT_EQ(superblocks::level_for_distance(2), 0); + ASSERT_EQ(superblocks::level_for_distance(3), 0); + ASSERT_EQ(superblocks::level_for_distance(4), 1); + ASSERT_EQ(superblocks::level_for_distance(7), 1); + ASSERT_EQ(superblocks::level_for_distance(8), 2); + ASSERT_EQ(superblocks::level_for_distance(15), 2); + ASSERT_EQ(superblocks::level_for_distance(16), 3); + ASSERT_EQ(superblocks::level_for_distance( + std::numeric_limits::max()), + sizeof(index_type) * CHAR_BIT - 3); + }; + test(gko::int32{}); + test(gko::int64{}); +} diff --git a/examples/batched-solver/batched-solver.cpp b/examples/batched-solver/batched-solver.cpp index b5097d85258..dea91ad39e8 100644 --- a/examples/batched-solver/batched-solver.cpp +++ b/examples/batched-solver/batched-solver.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -54,7 +54,7 @@ auto unbatch(const InputType* batch_object) // // We use raw pointers below to demonstrate how to handle the situation when // the application only gives us raw pointers. Ideally, one should use -// Ginkgo's gko::Array class here. In this example, we assume that the data is +// Ginkgo's gko::array class here. In this example, we assume that the data is // in a format that can directly be given to a batch::matrix::Csr object. struct ApplSysData { // Number of small systems in the batch. diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 2798f18578f..33f01628910 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -37,6 +37,10 @@ #cmakedefine GKO_HAVE_CXXABI_H +/* Is std::size_t the same type as std::uint64_t? */ +#cmakedefine GKO_SIZE_T_IS_UINT64_T + + /* Should we use all optimizations for Jacobi? */ #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp index 37e7f361781..e89d3efae35 100644 --- a/include/ginkgo/core/base/intrinsics.hpp +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -21,7 +21,7 @@ namespace detail { GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __popc(bitmask); + return __popc(static_cast(bitmask)); #else std::bitset<32> bits{bitmask}; return bits.count(); @@ -29,13 +29,11 @@ GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) } -/** - * Returns the number of set bits in the given bitmask. - */ +/** @copydoc popcount(uint32) */ GKO_ATTRIBUTES GKO_INLINE int popcount(uint64 bitmask) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __popcll(bitmask); + return __popcll(static_cast(bitmask)); #else std::bitset<64> bits{bitmask}; return bits.count(); diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 27c04a4db09..9bea1190ef1 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -9,6 +9,7 @@ target_sources(ginkgo_reference components/absolute_array_kernels.cpp components/fill_array_kernels.cpp components/format_conversion_kernels.cpp + components/range_minimum_query_kernels.cpp components/reduce_array_kernels.cpp components/precision_conversion_kernels.cpp components/prefix_sum_kernels.cpp diff --git a/reference/components/fill_array_kernels.cpp b/reference/components/fill_array_kernels.cpp index 1649aa87982..5bbfab3bec4 100644 --- a/reference/components/fill_array_kernels.cpp +++ b/reference/components/fill_array_kernels.cpp @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #include "core/components/fill_array_kernels.hpp" #include +#include namespace gko { @@ -22,6 +23,11 @@ void fill_array(std::shared_ptr exec, ValueType* array, GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); template GKO_DECLARE_FILL_ARRAY_KERNEL(bool); +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint16); +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint32); +#ifndef GKO_SIZE_T_IS_UINT64_T +template GKO_DECLARE_FILL_ARRAY_KERNEL(uint64); +#endif template diff --git a/reference/components/range_minimum_query_kernels.cpp b/reference/components/range_minimum_query_kernels.cpp new file mode 100644 index 00000000000..1de963ae668 --- /dev/null +++ b/reference/components/range_minimum_query_kernels.cpp @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query_kernels.hpp" + +#include + +#include "core/base/intrinsics.hpp" +#include "core/components/bit_packed_storage.hpp" +#include "core/components/range_minimum_query.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace range_minimum_query { + + +template +void compute_lookup_inside_blocks( + std::shared_ptr exec, const IndexType* values, + IndexType size, bit_packed_span& block_argmin, + IndexType* block_min, uint16* block_tree_index) +{ + using rmq_type = gko::range_minimum_query; + constexpr auto block_size = rmq_type::block_size; + using tree_index_type = std::decay_t; + using lut_type = typename rmq_type::block_lut_view_type; + lut_type table; + static_assert( + lut_type::num_trees <= std::numeric_limits::max(), + "block type storage too small"); + for (IndexType i = 0; i < size; i += block_size) { + IndexType local_values[block_size]; + for (int local_i = 0; local_i < block_size; local_i++) { + // use "infinity" as sentinel for minimum computations + local_values[local_i] = local_i + i < size + ? values[local_i + i] + : std::numeric_limits::max(); + } + const auto tree_number = table.compute_tree_index(local_values); + const auto min_it = + std::min_element(local_values, local_values + block_size); + const auto min_idx = + static_cast(std::distance(local_values, min_it)); + const auto block_idx = i / block_size; + block_argmin.set(block_idx, min_idx); + block_min[block_idx] = *min_it; + block_tree_index[block_idx] = static_cast(tree_number); + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL); + + +template +void compute_lookup_across_blocks( + std::shared_ptr exec, const IndexType* block_min, + IndexType num_blocks, + device_range_minimum_query_superblocks& superblocks) +{ + using superblock_type = device_range_minimum_query_superblocks; + constexpr auto infinity = std::numeric_limits::max(); + if (num_blocks < 2) { + return; + } + // initialize the first level of blocks + for (const auto i : irange{num_blocks}) { + const auto min1 = block_min[i]; + const auto min2 = i + 1 < num_blocks ? block_min[i + 1] : infinity; + // we need to use <= here to make sure ties always break to the left + superblocks.set_block_argmin(0, i, min1 <= min2 ? 0 : 1); + } + // we computed argmins for blocks of size 2, now recursively combine them. + const auto num_levels = superblocks.num_levels(); + for (const auto block_level : irange{1, num_levels}) { + const auto block_size = + superblock_type::block_size_for_level(block_level); + for (const auto i : irange{num_blocks}) { + const auto i2 = i + block_size / 2; + const auto argmin1 = + i + superblocks.block_argmin(block_level - 1, i); + const auto argmin2 = + i2 < num_blocks + ? i2 + superblocks.block_argmin(block_level - 1, i2) + : argmin1; + const auto min1 = block_min[argmin1]; + const auto min2 = block_min[argmin2]; + // we need to use <= here to make sure ties always break to the left + superblocks.set_block_argmin( + block_level, i, min1 <= min2 ? argmin1 - i : argmin2 - i); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL); + + +} // namespace range_minimum_query +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt index e9737f5c106..b17880ab32d 100644 --- a/reference/test/components/CMakeLists.txt +++ b/reference/test/components/CMakeLists.txt @@ -3,4 +3,5 @@ ginkgo_create_test(fill_array_kernels) ginkgo_create_test(format_conversion_kernels) ginkgo_create_test(precision_conversion_kernels) ginkgo_create_test(prefix_sum_kernels) +ginkgo_create_test(range_minimum_query_kernels) ginkgo_create_test(reduce_array_kernels) diff --git a/reference/test/components/range_minimum_query_kernels.cpp b/reference/test/components/range_minimum_query_kernels.cpp new file mode 100644 index 00000000000..8bbe131fdf6 --- /dev/null +++ b/reference/test/components/range_minimum_query_kernels.cpp @@ -0,0 +1,211 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query_kernels.hpp" + +#include +#include +#include +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/components/range_minimum_query.hpp" +#include "core/test/utils.hpp" + + +template +class RangeMinimumQuery : public ::testing::Test { +protected: + using index_type = IndexType; + using storage_type = std::make_unsigned_t; + using device_type = gko::range_minimum_query; + using block_argmin_view_type = typename device_type::block_argmin_view_type; + using superblock_view_type = typename device_type::superblock_view_type; + constexpr static auto block_size = device_type::block_size; + RangeMinimumQuery() + : ref{gko::ReferenceExecutor::create()}, + rng{167349}, + // keep these in sync with small_block_size: + // we should cover a single incomplete block, multiple blocks with the + // last block being either complete or incomplete + sizes{0, 1, 2, 3, 7, 8, 9, 10, 15, 16, 17, 25, 127, 128, 129, 1023} + {} + + std::vector create_random_values(index_type size) + { + std::vector values(size); + std::uniform_int_distribution dist(0, 10000); + for (auto& value : values) { + value = dist(this->rng); + } + return values; + } + + std::shared_ptr ref; + std::default_random_engine rng; + std::vector sizes; +}; + +TYPED_TEST_SUITE(RangeMinimumQuery, gko::test::IndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(RangeMinimumQuery, ComputeLookupSmall) +{ + using index_type = typename TestFixture::index_type; + using block_argmin_view_type = typename TestFixture::block_argmin_view_type; + constexpr auto block_size = TestFixture::block_size; + constexpr auto block_argmin_num_bits = gko::ceil_log2_constexpr(block_size); + for (index_type size : this->sizes) { + SCOPED_TRACE(size); + const auto values = this->create_random_values(size); + const auto num_blocks = + static_cast(gko::ceildiv(size, block_size)); + std::vector block_argmin_storage( + block_argmin_view_type::storage_size(num_blocks, + block_argmin_num_bits)); + block_argmin_view_type block_argmin{block_argmin_storage.data(), + block_argmin_num_bits, num_blocks}; + std::vector block_min(num_blocks); + std::vector block_tree_index(num_blocks); + gko::device_block_range_minimum_query_lookup_table + small_lut; + + gko::kernels::reference::range_minimum_query:: + compute_lookup_inside_blocks(this->ref, values.data(), size, + block_argmin, block_min.data(), + block_tree_index.data()); + + for (auto block : gko::irange{num_blocks}) { + SCOPED_TRACE(block); + const auto block_begin = values.begin() + block * block_size; + const auto block_end = + values.begin() + std::min(size, (block + 1) * block_size); + const auto block_local_size = block_end - block_begin; + const auto min_it = std::min_element(block_begin, block_end); + const auto min_value = *min_it; + const auto min_pos = min_it - block_begin; + ASSERT_EQ(min_pos, block_argmin.get(block)); + ASSERT_EQ(min_value, block_min[block]); + const auto tree = block_tree_index[block]; + for (auto first : gko::irange{block_local_size}) { + for (auto last : gko::irange{first, block_local_size}) { + const auto argmin = std::distance( + block_begin, std::min_element(block_begin + first, + block_begin + last + 1)); + ASSERT_EQ(argmin, small_lut.lookup(static_cast(tree), + first, last)) + << "in range [" << first << "," << last << "]"; + } + } + } + } +} + +TYPED_TEST(RangeMinimumQuery, ComputeLookupLarge) +{ + using index_type = typename TestFixture::index_type; + using superblock_view_type = typename TestFixture::superblock_view_type; + using storage_type = typename TestFixture::storage_type; + for (index_type num_blocks : this->sizes) { + SCOPED_TRACE(num_blocks); + const auto block_min = this->create_random_values(num_blocks); + std::vector superblock_storage( + superblock_view_type::storage_size(num_blocks)); + superblock_view_type superblocks(block_min.data(), + superblock_storage.data(), num_blocks); + + gko::kernels::reference::range_minimum_query:: + compute_lookup_across_blocks(this->ref, block_min.data(), + num_blocks, superblocks); + + for (auto level : gko::irange(superblocks.num_levels())) { + SCOPED_TRACE(level); + const auto block_size = + superblock_view_type::block_size_for_level(level); + for (auto block : gko::irange(num_blocks)) { + const auto begin = block_min.begin() + block; + const auto end = block_min.begin() + + std::min(block + block_size, num_blocks); + const auto argmin = std::min_element(begin, end) - begin; + ASSERT_EQ(superblocks.block_argmin(level, block), argmin); + } + } + } +} + + +TYPED_TEST(RangeMinimumQuery, SuperblockQuery) +{ + using index_type = typename TestFixture::index_type; + using superblock_view_type = typename TestFixture::superblock_view_type; + using storage_type = typename TestFixture::storage_type; + for (index_type num_blocks : this->sizes) { + SCOPED_TRACE(num_blocks); + const auto block_min = this->create_random_values(num_blocks); + std::vector superblock_storage( + superblock_view_type::storage_size(num_blocks)); + superblock_view_type superblocks(block_min.data(), + superblock_storage.data(), num_blocks); + gko::kernels::reference::range_minimum_query:: + compute_lookup_across_blocks(this->ref, block_min.data(), + num_blocks, superblocks); + for (auto first : gko::irange{num_blocks}) { + SCOPED_TRACE(first); + for (auto last : gko::irange{first, num_blocks}) { + SCOPED_TRACE(last); + const auto begin = block_min.begin() + first; + const auto end = block_min.begin() + last + 1; + const auto min_it = std::min_element(begin, end); + const auto argmin = std::distance(block_min.begin(), min_it); + const auto min = *min_it; + + const auto result = superblocks.query(first, last); + + // checking min first tells us when the minimum is correct, but + // the location is incorrect + ASSERT_EQ(min, result.min); + ASSERT_EQ(argmin, result.argmin); + } + } + } +} + + +TYPED_TEST(RangeMinimumQuery, FullQuery) +{ + using index_type = typename TestFixture::index_type; + using block_argmin_view_type = typename TestFixture::block_argmin_view_type; + constexpr auto block_size = TestFixture::block_size; + constexpr auto block_argmin_num_bits = gko::ceil_log2_constexpr(block_size); + using superblock_view_type = typename TestFixture::superblock_view_type; + using storage_type = typename TestFixture::storage_type; + for (index_type size : this->sizes) { + SCOPED_TRACE(size); + const auto values = this->create_random_values(size); + gko::range_minimum_query rmq{ + gko::array{this->ref, values.begin(), values.end()}}; + + for (auto first : gko::irange{size}) { + SCOPED_TRACE(first); + for (auto last : gko::irange{first, size}) { + SCOPED_TRACE(last); + const auto begin = values.begin() + first; + const auto end = values.begin() + last + 1; + const auto min_it = std::min_element(begin, end); + const auto argmin = std::distance(values.begin(), min_it); + const auto min = *min_it; + + const auto result = rmq.get().query(first, last); + + // checking min first tells us when the minimum is correct, but + // the location is incorrect + ASSERT_EQ(min, result.min); + ASSERT_EQ(argmin, result.argmin); + } + } + } +} diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt index bc2ea73620f..5763f5be69b 100644 --- a/test/base/CMakeLists.txt +++ b/test/base/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_common_test(batch_multi_vector_kernels) ginkgo_create_common_and_reference_test(device_matrix_data_kernels) ginkgo_create_common_device_test(index_range) +ginkgo_create_common_device_test(intrinsics) ginkgo_create_common_device_test(iterator_factory) ginkgo_create_common_device_test(kernel_launch_generic) ginkgo_create_common_and_reference_test(executor) diff --git a/test/base/intrinsics.cpp b/test/base/intrinsics.cpp new file mode 100644 index 00000000000..727a55e5993 --- /dev/null +++ b/test/base/intrinsics.cpp @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/base/intrinsics.hpp" + +#include + +#include + +#include + +#include "common/unified/base/kernel_launch.hpp" +#include "core/test/utils.hpp" +#include "test/utils/common_fixture.hpp" + + +class Intrinsics : public CommonTestFixture { +public: + Intrinsics() + : input_array32{exec, {1u, 0x80ffu, 0x100u, ~0u}}, + input_array64{exec, {1ull, 0x1000f48006010400ull, 0x40000ull, ~0ull}}, + output_array{exec, 4} + {} + + gko::array input_array32; + gko::array input_array64; + gko::array output_array; +}; + + +// nvcc doesn't like device lambdas declared in complex classes, move it out +template +void run_popcount(std::shared_ptr exec, gko::size_type size, + T* in, int* out) +{ + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto in, auto out) { + out[i] = gko::detail::popcount(in[i]); + }, + size, in, out); +} + +TEST_F(Intrinsics, Popcount32) +{ + run_popcount(exec, input_array32.get_size(), input_array32.get_data(), + output_array.get_data()); + + GKO_ASSERT_ARRAY_EQ(output_array, (I{1, 9, 1, 32})); +} + +TEST_F(Intrinsics, Popcount64) +{ + run_popcount(exec, input_array64.get_size(), input_array64.get_data(), + output_array.get_data()); + + GKO_ASSERT_ARRAY_EQ(output_array, (I{1, 11, 1, 64})); +} + + +// nvcc doesn't like device lambdas declared in complex classes, move it out +template +void run_find_lowest_bit(std::shared_ptr exec, + gko::size_type size, T* in, int* out) +{ + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto in, auto out) { + out[i] = gko::detail::find_lowest_bit(in[i]); + }, + size, in, out); +} + +TEST_F(Intrinsics, FindLowestBit32) +{ + run_find_lowest_bit(exec, input_array32.get_size(), + input_array32.get_data(), output_array.get_data()); + + GKO_ASSERT_ARRAY_EQ(output_array, (I{0, 0, 8, 0})); +} + +TEST_F(Intrinsics, FindLowestBit64) +{ + run_find_lowest_bit(exec, input_array64.get_size(), + input_array64.get_data(), output_array.get_data()); + + GKO_ASSERT_ARRAY_EQ(output_array, (I{0, 10, 18, 0})); +} + + +// nvcc doesn't like device lambdas declared in complex classes, move it out +template +void run_find_highest_bit(std::shared_ptr exec, + gko::size_type size, T* in, int* out) +{ + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto in, auto out) { + out[i] = gko::detail::find_highest_bit(in[i]); + }, + size, in, out); +} + +TEST_F(Intrinsics, FindHighestBit32) +{ + run_find_highest_bit(exec, input_array32.get_size(), + input_array32.get_data(), output_array.get_data()); + + GKO_ASSERT_ARRAY_EQ(output_array, (I{0, 15, 8, 31})); +} + +TEST_F(Intrinsics, FindHighestBit64) +{ + run_find_highest_bit(exec, input_array64.get_size(), + input_array64.get_data(), output_array.get_data()); + + GKO_ASSERT_ARRAY_EQ(output_array, (I{0, 60, 18, 63})); +} diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt index 9b7d5c1aef7..12e738d8eaa 100644 --- a/test/components/CMakeLists.txt +++ b/test/components/CMakeLists.txt @@ -3,4 +3,5 @@ ginkgo_create_common_test(fill_array_kernels) ginkgo_create_common_test(format_conversion_kernels) ginkgo_create_common_test(precision_conversion_kernels) ginkgo_create_common_test(prefix_sum_kernels) +ginkgo_create_common_device_test(range_minimum_query_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(reduce_array_kernels) diff --git a/test/components/range_minimum_query_kernels.cpp b/test/components/range_minimum_query_kernels.cpp new file mode 100644 index 00000000000..88ba5c8f089 --- /dev/null +++ b/test/components/range_minimum_query_kernels.cpp @@ -0,0 +1,245 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query_kernels.hpp" + +#include +#include +#include +#include +#include + +#include + +#include + +#include "common/unified/base/kernel_launch.hpp" +#include "core/base/index_range.hpp" +#include "core/test/utils.hpp" +#include "test/utils/common_fixture.hpp" + + +// workaround for cudafe 11.0 bug +using gko::irange; + + +template +class RangeMinimumQuery : public CommonTestFixture { +protected: + using index_type = T; + using storage_type = std::make_unsigned_t; + using block_argmin_view_type = + typename gko::range_minimum_query::block_argmin_view_type; + using superblock_view_type = + gko::device_range_minimum_query_superblocks; + constexpr static auto block_size = + gko::range_minimum_query::block_size; + + RangeMinimumQuery() + : rng{19654}, sizes{0, 1, 7, 8, 9, 1023, 1024, 1025, 10000, 100000} + {} + + gko::array create_random_values(index_type size) + { + gko::array data{this->ref, + static_cast(size)}; + std::uniform_int_distribution dist{ + 0, std::numeric_limits::max()}; + for (auto i : gko::irange{size}) { + data.get_data()[i] = dist(rng); + } + return data; + } + + std::pair, gko::array> + create_random_queries(index_type size) + { + std::vector begins; + std::vector ends; + if (size > 0) { + std::uniform_int_distribution dist{0, size - 1}; + const auto add_query = [&](index_type begin, index_type end) { + assert(begin <= end); + assert(begin >= 0); + assert(end < size); + begins.push_back(begin); + ends.push_back(end); + }; + for (const auto i : gko::irange{size}) { + // singleton queries + add_query(i, i); + } + for (const auto i : gko::irange{100}) { + // random block-local queries + const auto begin = dist(rng); + std::uniform_int_distribution end_dist{ + begin, std::min(size - 1, + (begin / block_size + 1) * block_size - 1)}; + add_query(begin, end_dist(rng)); + } + for (const auto i : gko::irange{100}) { + // random two-block queries + const auto begin = dist(rng); + std::uniform_int_distribution end_dist{ + std::min(size - 1, + (begin / block_size + 1) * block_size - 1), + std::min(size - 1, + (begin / block_size + 2) * block_size - 1)}; + add_query(begin, end_dist(rng)); + } + for (const auto i : gko::irange{100}) { + // random arbitrary queries + const auto begin = dist(rng); + std::uniform_int_distribution end_dist{begin, + size - 1}; + add_query(begin, end_dist(rng)); + } + } + return std::make_pair( + gko::array{this->ref, begins.begin(), begins.end()}, + gko::array{this->ref, ends.begin(), ends.end()}); + } + + std::default_random_engine rng; + std::vector sizes; +}; + +TYPED_TEST_SUITE(RangeMinimumQuery, gko::test::IndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(RangeMinimumQuery, ComputeLookupSmallAndLargeIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + using block_argmin_view_type = typename TestFixture::block_argmin_view_type; + using superblock_view_type = typename TestFixture::superblock_view_type; + constexpr auto block_size = TestFixture::block_size; + constexpr auto block_argmin_num_bits = gko::ceil_log2_constexpr(block_size); + for (auto size : this->sizes) { + SCOPED_TRACE(size); + auto values = this->create_random_values(size); + auto dvalues = gko::array{this->exec, values}; + const auto num_blocks = + static_cast(gko::ceildiv(size, block_size)); + gko::array block_min{this->ref, num_blocks}; + gko::array dblock_min{this->exec, num_blocks}; + const auto block_argmin_storage_size = + static_cast(block_argmin_view_type::storage_size( + num_blocks, block_argmin_num_bits)); + gko::array block_argmin_storage{this->ref, + block_argmin_storage_size}; + gko::array dblock_argmin_storage{ + this->exec, block_argmin_storage_size}; + block_argmin_storage.fill(0); + dblock_argmin_storage.fill(0); + block_argmin_view_type block_argmin{ + block_argmin_storage.get_data(), block_argmin_num_bits, + static_cast(num_blocks)}; + block_argmin_view_type dblock_argmin{ + dblock_argmin_storage.get_data(), block_argmin_num_bits, + static_cast(num_blocks)}; + gko::array block_tree_index{this->ref, num_blocks}; + gko::array dblock_tree_index{this->exec, num_blocks}; + + gko::kernels::reference::range_minimum_query:: + compute_lookup_inside_blocks( + this->ref, values.get_const_data(), size, block_argmin, + block_min.get_data(), block_tree_index.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::range_minimum_query:: + compute_lookup_inside_blocks( + this->exec, dvalues.get_const_data(), size, dblock_argmin, + dblock_min.get_data(), dblock_tree_index.get_data()); + + GKO_ASSERT_ARRAY_EQ(block_min, dblock_min); + GKO_ASSERT_ARRAY_EQ(block_tree_index, dblock_tree_index); + GKO_ASSERT_ARRAY_EQ(block_argmin_storage, dblock_argmin_storage); + + if (num_blocks > 1) { + const auto superblock_storage_size = static_cast( + superblock_view_type::storage_size(num_blocks)); + gko::array superblock_storage{ + this->ref, superblock_storage_size}; + gko::array dsuperblock_storage{ + this->exec, superblock_storage_size}; + superblock_storage.fill(0); + dsuperblock_storage.fill(0); + superblock_view_type superblocks{ + block_min.get_const_data(), superblock_storage.get_data(), + static_cast(num_blocks)}; + superblock_view_type dsuperblocks{ + dblock_min.get_const_data(), dsuperblock_storage.get_data(), + static_cast(num_blocks)}; + + gko::kernels::reference::range_minimum_query:: + compute_lookup_across_blocks( + this->ref, block_min.get_const_data(), + static_cast(num_blocks), superblocks); + gko::kernels::GKO_DEVICE_NAMESPACE::range_minimum_query:: + compute_lookup_across_blocks( + this->exec, dblock_min.get_const_data(), + static_cast(num_blocks), dsuperblocks); + + GKO_ASSERT_ARRAY_EQ(superblock_storage, dsuperblock_storage); + } + } +} + + +// nvcc doesn't like device lambdas inside class member functions + +template +void run_rmq_device(std::shared_ptr exec, + const gko::array& dbegins, + const gko::array& dends, + const gko::range_minimum_query& drmq, + gko::array& doutput_min, + gko::array& doutput_argmin) +{ + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto begins, auto ends, auto rmq, auto output_min, + auto output_argmin) { + const auto begin = begins[i]; + const auto end = ends[i]; + const auto result = rmq.query(begin, end); + output_min[i] = result.min; + output_argmin[i] = result.argmin; + }, + dbegins.get_size(), dbegins, dends, drmq.get(), doutput_min, + doutput_argmin); +} + + +TYPED_TEST(RangeMinimumQuery, QueryIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + auto values = this->create_random_values(size); + const auto [begins, ends] = this->create_random_queries(size); + gko::array output_argmin{this->ref, begins.get_size()}; + gko::array output_min{this->ref, begins.get_size()}; + gko::array dvalues{this->exec, values}; + gko::array dbegins{this->exec, begins}; + gko::array dends{this->exec, ends}; + gko::array doutput_min{this->exec, begins.get_size()}; + gko::array doutput_argmin{this->exec, begins.get_size()}; + gko::range_minimum_query rmq{std::move(values)}; + gko::range_minimum_query drmq{std::move(dvalues)}; + + for (const auto i : + gko::irange{static_cast(begins.get_size())}) { + const auto result = rmq.get().query(begins.get_const_data()[i], + ends.get_const_data()[i]); + output_min.get_data()[i] = result.min; + output_argmin.get_data()[i] = result.argmin; + } + run_rmq_device(this->exec, dbegins, dends, drmq, doutput_min, + doutput_argmin); + + GKO_ASSERT_ARRAY_EQ(output_min, doutput_min); + GKO_ASSERT_ARRAY_EQ(output_argmin, doutput_argmin); + } +} diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 752342a8e64..44248cdda9d 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -443,8 +443,8 @@ class VectorReductions : public CommonMpiTestFixture { dense_real_res = real_dense_type ::create(exec); real_res = real_dense_type ::create(exec); - dense_tmp = gko::Array(exec); - tmp = gko::Array(exec); + dense_tmp = gko::array(exec); + tmp = gko::array(exec); auto num_parts = static_cast(