Merge range minimum query kernels (#1764)

This adds kernels for building a range minimum query (RMQ) data structure, along with support for querying it from device kernels. They will be used for lowest common ancestor (LCA) queries in the symbolic Cholesky factorization. Related PR: #1764
ginkgo-project · Jan 20, 2025 · efbf245 · efbf245
2 parents 0b3436e + aca9c69
commit efbf245
Show file tree

Hide file tree

Showing 32 changed files with 2,488 additions and 20 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -106,7 +106,8 @@ build/cuda110/nompi/gcc/cuda/release/shared:
     # fix gtest issue https://github.com/google/googletest/issues/3514
     CXX_FLAGS: "-Wno-error=maybe-uninitialized"
     # disable spurious unused argument warning
-    EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
+    # the -diag-suppress=177 flag is seemingly broken with CUDA 11, so it is disabled
+    # EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
 
 
 # nvhpc and friends

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -122,6 +122,7 @@ include(cmake/build_type_helpers.cmake)
 # Load other CMake helpers
 include(cmake/build_helpers.cmake)
 include(cmake/install_helpers.cmake)
+include(cmake/compiler_features.cmake)
 
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")

diff --git a/cmake/compiler_features.cmake b/cmake/compiler_features.cmake
@@ -0,0 +1,8 @@
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles(
+    "#include <type_traits>
+    #include <cstdint>
+    static_assert(std::is_same<std::uint64_t, std::size_t>::value, \"INSTANTIATE_UINT64\");
+    int main() {}"
+    GKO_SIZE_T_IS_UINT64_T
+    FAIL_REGEX ".*INSTANTIATE_UINT64.#")
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
@@ -273,25 +273,25 @@ endfunction(ginkgo_create_common_test_internal)
 function(ginkgo_create_common_device_test test_name)
     cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}")
     ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
-    if(GINKGO_BUILD_SYCL)
+    if(GINKGO_BUILD_SYCL AND NOT ("dpcpp" IN_LIST common_device_test_DISABLE_EXECUTORS))
         ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN})
         target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS})
         # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property.
         configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY)
         gko_add_sycl_to_target(TARGET ${test_target_name}_dpcpp SOURCES ${test_name}.dp.cpp)
         target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel)
     endif()
-    if(GINKGO_BUILD_OMP)
+    if(GINKGO_BUILD_OMP AND NOT ("omp" IN_LIST common_device_test_DISABLE_EXECUTORS))
         ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN})
         target_link_libraries(${test_target_name}_omp PUBLIC OpenMP::OpenMP_CXX)
     endif()
-    if(GINKGO_BUILD_CUDA)
+    if(GINKGO_BUILD_CUDA AND NOT ("cuda" IN_LIST common_device_test_DISABLE_EXECUTORS))
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.cu COPYONLY)
         ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN})
         target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda)
     endif()
-    if(GINKGO_BUILD_HIP)
+    if(GINKGO_BUILD_HIP AND NOT ("hip" IN_LIST common_device_test_DISABLE_EXECUTORS))
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY)
         ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN})

diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
@@ -5,6 +5,7 @@ set(UNIFIED_SOURCES
     components/fill_array_kernels.cpp
     components/format_conversion_kernels.cpp
     components/precision_conversion_kernels.cpp
+    components/range_minimum_query_kernels.cpp
     components/reduce_array_kernels.cpp
     distributed/assembly_kernels.cpp
     distributed/partition_helpers_kernels.cpp

diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp
@@ -1,9 +1,11 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "core/components/fill_array_kernels.hpp"
 
+#include <type_traits>
+
 #include "common/unified/base/kernel_launch.hpp"
 
 
@@ -25,6 +27,11 @@ void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType* array,
 
 GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(uint16);
+template GKO_DECLARE_FILL_ARRAY_KERNEL(uint32);
+#ifndef GKO_SIZE_T_IS_UINT64_T
+template GKO_DECLARE_FILL_ARRAY_KERNEL(uint64);
+#endif
 
 
 template <typename ValueType>

diff --git a/common/unified/components/range_minimum_query_kernels.cpp b/common/unified/components/range_minimum_query_kernels.cpp
@@ -0,0 +1,184 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/components/range_minimum_query_kernels.hpp"
+
+#include <limits>
+
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/base/intrinsics.hpp"
+#include "core/components/bit_packed_storage.hpp"
+#include "core/components/range_minimum_query.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace range_minimum_query {
+
+
+template <typename IndexType>
+void compute_lookup_inside_blocks(
+    std::shared_ptr<const DefaultExecutor> exec, const IndexType* values,
+    IndexType size, bit_packed_span<int, IndexType, uint32>& block_argmin,
+    IndexType* block_min, uint16* block_tree_index)
+{
+#ifdef GKO_COMPILING_DPCPP
+    // The Intel SYCL compiler doesn't support constexpr initialization of
+    // non-trivial objects on the device.
+    GKO_NOT_IMPLEMENTED;
+#else
+    using rmq_type = gko::range_minimum_query<IndexType>;
+    constexpr auto block_size = rmq_type::block_size;
+    using tree_index_type = std::decay_t<decltype(*block_tree_index)>;
+    using device_lut_type = typename rmq_type::block_lut_type;
+    using lut_type = typename rmq_type::block_lut_view_type;
+    static_assert(
+        lut_type::num_trees <= std::numeric_limits<tree_index_type>::max(),
+        "block type storage too small");
+    // block_argmin stores multiple values per memory word, so we need to make
+    // sure that no two different threads write to the same memory location.
+    // The easiest way to do that is to have every thread handle all elements
+    // that map to the same memory location.
+    // The argmin inside a block is in the range [0, block_size - 1], so
+    // it needs ceil_log2_constexpr(block_size) bits. For efficiency
+    // reasons, we round that up to the next power of two.
+    // This expression is essentially bits_per_word /
+    // round_up_pow2_constexpr(ceil_log2_constexpr(block_size)), i.e. how
+    // many values are stored per word.
+    constexpr auto collation_width =
+        1 << (std::decay_t<decltype(block_argmin)>::bits_per_word_log2 -
+              ceil_log2_constexpr(ceil_log2_constexpr(block_size)));
+    const device_lut_type lut{exec};
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto collated_block_idx, auto values, auto block_argmin,
+                      auto block_min, auto block_tree_index, auto lut,
+                      auto size) {
+            // we need to put this here because some compilers interpret capture
+            // rules around constexpr incorrectly
+            constexpr auto block_size = rmq_type::block_size;
+            constexpr auto infinity = std::numeric_limits<IndexType>::max();
+            const auto num_blocks = ceildiv(size, block_size);
+            for (auto block_idx = collated_block_idx * collation_width;
+                 block_idx <
+                 std::min<int64>((collated_block_idx + 1) * collation_width,
+                                 num_blocks);
+                 block_idx++) {
+                const auto i = block_idx * block_size;
+                IndexType local_values[block_size];
+                int argmin = 0;
+#pragma unroll
+                for (int local_i = 0; local_i < block_size; local_i++) {
+                    // use "infinity" as sentinel for minimum computations
+                    local_values[local_i] =
+                        local_i + i < size ? values[local_i + i] : infinity;
+                    if (local_values[local_i] < local_values[argmin]) {
+                        argmin = local_i;
+                    }
+                }
+                const auto tree_number = lut->compute_tree_index(local_values);
+                const auto min = local_values[argmin];
+                block_argmin.set(block_idx, argmin);
+                block_min[block_idx] = min;
+                block_tree_index[block_idx] =
+                    static_cast<tree_index_type>(tree_number);
+            }
+        },
+        ceildiv(ceildiv(size, block_size), collation_width), values,
+        block_argmin, block_min, block_tree_index, lut.get(), size);
+#endif
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL);
+
+
+template <typename IndexType>
+void compute_lookup_across_blocks(
+    std::shared_ptr<const DefaultExecutor> exec, const IndexType* block_min,
+    IndexType num_blocks,
+    device_range_minimum_query_superblocks<IndexType>& superblocks)
+{
+#ifdef GKO_COMPILING_DPCPP
+    GKO_NOT_IMPLEMENTED;
+#else
+    if (num_blocks < 2) {
+        return;
+    }
+    using superblock_type = device_range_minimum_query_superblocks<IndexType>;
+    using storage_type = typename superblock_type::storage_type;
+    // we need to collate all writes that target the same memory word in a
+    // single thread
+    constexpr auto level0_collation_width = sizeof(storage_type) * CHAR_BIT;
+    // initialize the first level of blocks
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto collated_i, auto block_min, auto superblocks,
+                      auto num_blocks) {
+            constexpr auto infinity = std::numeric_limits<IndexType>::max();
+            for (auto i = collated_i * level0_collation_width;
+                 i < std::min<int64>((collated_i + 1) * level0_collation_width,
+                                     num_blocks);
+                 i++) {
+                const auto min1 = block_min[i];
+                const auto min2 =
+                    i + 1 < num_blocks ? block_min[i + 1] : infinity;
+                // we need to use <= here to make sure ties always break to the
+                // left
+                superblocks.set_block_argmin(0, i, min1 <= min2 ? 0 : 1);
+            }
+        },
+        ceildiv(num_blocks, level0_collation_width), block_min, superblocks,
+        num_blocks);
+    // we computed argmins for blocks of size 2, now recursively combine them.
+    const auto num_levels = superblocks.num_levels();
+    for (int block_level = 1; block_level < num_levels; block_level++) {
+        const auto block_size =
+            superblock_type::block_size_for_level(block_level);
+        // we need block_level + 1 bits to represent values of size block_size
+        // and round up to the next power of two
+        const auto collation_width =
+            level0_collation_width / round_up_pow2(block_level + 1);
+        run_kernel(
+            exec,
+            [] GKO_KERNEL(auto collated_i, auto block_level, auto block_min,
+                          auto superblocks, auto num_blocks,
+                          auto collation_width) {
+                const auto block_size =
+                    superblock_type::block_size_for_level(block_level);
+                for (auto i = collated_i * collation_width;
+                     i < std::min<int64>((collated_i + 1) * collation_width,
+                                         num_blocks);
+                     i++) {
+                    const auto i2 = i + block_size / 2;
+                    const auto argmin1 =
+                        i + superblocks.block_argmin(block_level - 1, i);
+                    const auto argmin2 =
+                        i2 < num_blocks
+                            ? i2 + superblocks.block_argmin(block_level - 1, i2)
+                            : argmin1;
+                    const auto min1 = block_min[argmin1];
+                    const auto min2 = block_min[argmin2];
+                    // we need to use <= here to make sure
+                    // ties always break to the left
+                    superblocks.set_block_argmin(
+                        block_level, i,
+                        min1 <= min2 ? argmin1 - i : argmin2 - i);
+                }
+            },
+            ceildiv(num_blocks, collation_width), block_level, block_min,
+            superblocks, num_blocks, collation_width);
+    }
+#endif
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL);
+
+
+}  // namespace range_minimum_query
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
@@ -42,6 +42,7 @@ target_sources(${ginkgo_core}
     base/segmented_array.cpp
     base/timer.cpp
     base/version.cpp
+    components/range_minimum_query.cpp
     config/config.cpp
     config/config_helper.cpp
     config/property_tree.cpp

diff --git a/core/base/array.cpp b/core/base/array.cpp
@@ -1,9 +1,11 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "ginkgo/core/base/array.hpp"
 
+#include <type_traits>
+
 #include <ginkgo/core/base/math.hpp>
 
 #include "core/base/array_access.hpp"
@@ -89,6 +91,11 @@ ValueType reduce_add(const array<ValueType>& input_arr,
 #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)
 
 GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
+template GKO_DECLARE_ARRAY_FILL(uint16);
+template GKO_DECLARE_ARRAY_FILL(uint32);
+#ifndef GKO_SIZE_T_IS_UINT64_T
+template GKO_DECLARE_ARRAY_FILL(uint64);
+#endif
 
 
 #define GKO_DECLARE_ARRAY_REDUCE_ADD(_type) \

diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause