Skip to content

Commit

Permalink
Merge range minimum query kernels (#1764)
Browse files Browse the repository at this point in the history
This adds kernels for building a range minimum query data structure and support for querying it from device kernels.
They will be used for symbolic Cholesky LCA queries.

Related PR: #1764
  • Loading branch information
upsj authored Jan 20, 2025
2 parents 0b3436e + aca9c69 commit efbf245
Show file tree
Hide file tree
Showing 32 changed files with 2,488 additions and 20 deletions.
3 changes: 2 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ build/cuda110/nompi/gcc/cuda/release/shared:
# fix gtest issue https://github.com/google/googletest/issues/3514
CXX_FLAGS: "-Wno-error=maybe-uninitialized"
# disable spurious unused argument warning
EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
# this is seemingly broken with CUDA 11
# EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"


# nvhpc and friends
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ include(cmake/build_type_helpers.cmake)
# Load other CMake helpers
include(cmake/build_helpers.cmake)
include(cmake/install_helpers.cmake)
include(cmake/compiler_features.cmake)

if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
Expand Down
8 changes: 8 additions & 0 deletions cmake/compiler_features.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Detect whether std::size_t and std::uint64_t name the same type.
#
# Result variable: GKO_SIZE_T_IS_UINT64_T
#   Set to a true value by check_cxx_source_compiles() if the static_assert in
#   the probe below compiles, i.e. if both names denote the same type.
#
# The probe includes <cstddef> explicitly, because that is the header that
# declares std::size_t; <cstdint> and <type_traits> are not required to.
# In addition to a plain compile failure, the check is treated as failed if the
# static_assert message appears anywhere in the compiler output.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
    "#include <cstddef>
#include <cstdint>
#include <type_traits>
static_assert(std::is_same<std::uint64_t, std::size_t>::value, \"INSTANTIATE_UINT64\");
int main() {}"
    GKO_SIZE_T_IS_UINT64_T
    FAIL_REGEX ".*INSTANTIATE_UINT64.*"
)
8 changes: 4 additions & 4 deletions cmake/create_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -273,25 +273,25 @@ endfunction(ginkgo_create_common_test_internal)
function(ginkgo_create_common_device_test test_name)
cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}")
ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
if(GINKGO_BUILD_SYCL)
if(GINKGO_BUILD_SYCL AND NOT ("dpcpp" IN_LIST common_device_test_DISABLE_EXECUTORS))
ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN})
target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS})
# We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property.
configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY)
gko_add_sycl_to_target(TARGET ${test_target_name}_dpcpp SOURCES ${test_name}.dp.cpp)
target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel)
endif()
if(GINKGO_BUILD_OMP)
if(GINKGO_BUILD_OMP AND NOT ("omp" IN_LIST common_device_test_DISABLE_EXECUTORS))
ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN})
target_link_libraries(${test_target_name}_omp PUBLIC OpenMP::OpenMP_CXX)
endif()
if(GINKGO_BUILD_CUDA)
if(GINKGO_BUILD_CUDA AND NOT ("cuda" IN_LIST common_device_test_DISABLE_EXECUTORS))
# need to make a separate file for this, since we can't set conflicting properties on the same file
configure_file(${test_name}.cpp ${test_name}.cu COPYONLY)
ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN})
target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda)
endif()
if(GINKGO_BUILD_HIP)
if(GINKGO_BUILD_HIP AND NOT ("hip" IN_LIST common_device_test_DISABLE_EXECUTORS))
# need to make a separate file for this, since we can't set conflicting properties on the same file
configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY)
ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN})
Expand Down
1 change: 1 addition & 0 deletions common/unified/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set(UNIFIED_SOURCES
components/fill_array_kernels.cpp
components/format_conversion_kernels.cpp
components/precision_conversion_kernels.cpp
components/range_minimum_query_kernels.cpp
components/reduce_array_kernels.cpp
distributed/assembly_kernels.cpp
distributed/partition_helpers_kernels.cpp
Expand Down
9 changes: 8 additions & 1 deletion common/unified/components/fill_array_kernels.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/components/fill_array_kernels.hpp"

#include <type_traits>

#include "common/unified/base/kernel_launch.hpp"


Expand All @@ -25,6 +27,11 @@ void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType* array,

GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
template GKO_DECLARE_FILL_ARRAY_KERNEL(uint16);
template GKO_DECLARE_FILL_ARRAY_KERNEL(uint32);
#ifndef GKO_SIZE_T_IS_UINT64_T
template GKO_DECLARE_FILL_ARRAY_KERNEL(uint64);
#endif


template <typename ValueType>
Expand Down
184 changes: 184 additions & 0 deletions common/unified/components/range_minimum_query_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/components/range_minimum_query_kernels.hpp"

#include <limits>

#include "common/unified/base/kernel_launch.hpp"
#include "core/base/intrinsics.hpp"
#include "core/components/bit_packed_storage.hpp"
#include "core/components/range_minimum_query.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace range_minimum_query {


/**
 * Computes the per-block lookup data of a range minimum query structure.
 *
 * The input is processed in consecutive blocks of `rmq_type::block_size`
 * values. For every block, the kernel stores
 * - in `block_argmin`: the position of the block's minimum relative to the
 *   start of the block (the first minimum wins on ties, since the comparison
 *   is a strict `<`),
 * - in `block_min`: the value of that minimum,
 * - in `block_tree_index`: the result of `compute_tree_index` of the block
 *   lookup table applied to the block's values.
 * Positions of the last block that lie at or beyond `size` are treated as
 * `std::numeric_limits<IndexType>::max()`.
 *
 * If GKO_COMPILING_DPCPP is defined, the body consists only of
 * GKO_NOT_IMPLEMENTED.
 *
 * @param exec  the executor the kernel is launched on
 * @param values  the input values, read at indices [0, size)
 * @param size  the number of input values
 * @param block_argmin  bit-packed output storage for the per-block argmin
 * @param block_min  output array for the per-block minimum
 * @param block_tree_index  output array for the per-block tree index
 */
template <typename IndexType>
void compute_lookup_inside_blocks(
std::shared_ptr<const DefaultExecutor> exec, const IndexType* values,
IndexType size, bit_packed_span<int, IndexType, uint32>& block_argmin,
IndexType* block_min, uint16* block_tree_index)
{
#ifdef GKO_COMPILING_DPCPP
// The Intel SYCL compiler doesn't support constexpr initialization of
// non-trivial objects on the device.
GKO_NOT_IMPLEMENTED;
#else
using rmq_type = gko::range_minimum_query<IndexType>;
constexpr auto block_size = rmq_type::block_size;
using tree_index_type = std::decay_t<decltype(*block_tree_index)>;
using device_lut_type = typename rmq_type::block_lut_type;
using lut_type = typename rmq_type::block_lut_view_type;
// every tree number must be representable in the output element type
static_assert(
lut_type::num_trees <= std::numeric_limits<tree_index_type>::max(),
"block type storage too small");
// block_argmin stores multiple values per memory word, so we need to make
// sure that no two different threads write to the same memory location.
// The easiest way to do that is to have every thread handle all elements
// that map to the same memory location.
// The argmin inside a block is in the range [0, block_size - 1], so
// it needs ceil_log2_constexpr(block_size) bits. For efficiency
// reasons, we round that up to the next power of two.
// This expression is essentially bits_per_word /
// round_up_pow2_constexpr(ceil_log2_constexpr(block_size)), i.e. how
// many values are stored per word.
constexpr auto collation_width =
1 << (std::decay_t<decltype(block_argmin)>::bits_per_word_log2 -
ceil_log2_constexpr(ceil_log2_constexpr(block_size)));
// NOTE(review): presumably this makes the block lookup table available on
// `exec`; the type is declared outside this file - confirm.
const device_lut_type lut{exec};
// One work item per group of collation_width consecutive blocks.
run_kernel(
exec,
[] GKO_KERNEL(auto collated_block_idx, auto values, auto block_argmin,
auto block_min, auto block_tree_index, auto lut,
auto size) {
// we need to put this here because some compilers interpret capture
// rules around constexpr incorrectly
constexpr auto block_size = rmq_type::block_size;
constexpr auto infinity = std::numeric_limits<IndexType>::max();
const auto num_blocks = ceildiv(size, block_size);
// iterate over the blocks of this group, clamped to the total number of
// blocks
for (auto block_idx = collated_block_idx * collation_width;
block_idx <
std::min<int64>((collated_block_idx + 1) * collation_width,
num_blocks);
block_idx++) {
// index of the first value belonging to this block
const auto i = block_idx * block_size;
IndexType local_values[block_size];
int argmin = 0;
#pragma unroll
for (int local_i = 0; local_i < block_size; local_i++) {
// use "infinity" as sentinel for minimum computations
local_values[local_i] =
local_i + i < size ? values[local_i + i] : infinity;
// strict comparison: the leftmost minimum is kept on ties
if (local_values[local_i] < local_values[argmin]) {
argmin = local_i;
}
}
const auto tree_number = lut->compute_tree_index(local_values);
const auto min = local_values[argmin];
block_argmin.set(block_idx, argmin);
block_min[block_idx] = min;
block_tree_index[block_idx] =
static_cast<tree_index_type>(tree_number);
}
},
ceildiv(ceildiv(size, block_size), collation_width), values,
block_argmin, block_min, block_tree_index, lut.get(), size);
#endif
}

// NOTE(review): both macros are defined outside this file; presumably this
// expands to explicit instantiations of the kernel above for each supported
// index type - confirm against the kernel header.
GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL);


/**
 * Computes the lookup data for range minimum queries spanning multiple
 * blocks.
 *
 * Level 0 stores, for every block index i, whether `block_min[i]` (0) or
 * `block_min[i + 1]` (1) is smaller; a missing right neighbor is treated as
 * `std::numeric_limits<IndexType>::max()`. Every following level combines two
 * entries of the previous level, the one starting at i and the one starting
 * at i + block_size / 2, and stores the offset (relative to i) of the block
 * with the smaller minimum. If the second entry would start at or beyond
 * `num_blocks`, the first entry's result is reused. Ties always resolve to
 * the left entry.
 *
 * Nothing is computed if `num_blocks < 2`. If GKO_COMPILING_DPCPP is defined,
 * the body consists only of GKO_NOT_IMPLEMENTED.
 *
 * @param exec  the executor the kernels are launched on
 * @param block_min  the minimum value of every block
 * @param num_blocks  the number of entries in `block_min`
 * @param superblocks  the superblock storage; it provides the number of
 *                     levels and is written via `set_block_argmin`
 */
template <typename IndexType>
void compute_lookup_across_blocks(
    std::shared_ptr<const DefaultExecutor> exec, const IndexType* block_min,
    IndexType num_blocks,
    device_range_minimum_query_superblocks<IndexType>& superblocks)
{
#ifdef GKO_COMPILING_DPCPP
    GKO_NOT_IMPLEMENTED;
#else
    if (num_blocks < 2) {
        return;
    }
    using superblock_type = device_range_minimum_query_superblocks<IndexType>;
    using storage_type = typename superblock_type::storage_type;
    // we need to collate all writes that target the same memory word in a
    // single thread
    constexpr auto level0_collation_width = sizeof(storage_type) * CHAR_BIT;
    // initialize the first level of blocks
    run_kernel(
        exec,
        [] GKO_KERNEL(auto collated_i, auto block_min, auto superblocks,
                      auto num_blocks) {
            constexpr auto infinity = std::numeric_limits<IndexType>::max();
            for (auto i = collated_i * level0_collation_width;
                 i < std::min<int64>((collated_i + 1) * level0_collation_width,
                                     num_blocks);
                 i++) {
                const auto min1 = block_min[i];
                const auto min2 =
                    i + 1 < num_blocks ? block_min[i + 1] : infinity;
                // we need to use <= here to make sure ties always break to the
                // left
                superblocks.set_block_argmin(0, i, min1 <= min2 ? 0 : 1);
            }
        },
        ceildiv(num_blocks, level0_collation_width), block_min, superblocks,
        num_blocks);
    // we computed argmins for blocks of size 2, now recursively combine them.
    const auto num_levels = superblocks.num_levels();
    for (int block_level = 1; block_level < num_levels; block_level++) {
        // The block size of this level is only needed inside the kernel, where
        // it is recomputed from block_level; no host-side copy is kept, which
        // avoids an unused-variable diagnostic.
        // we need block_level + 1 bits to represent the argmin values of this
        // level and round up to the next power of two
        const auto collation_width =
            level0_collation_width / round_up_pow2(block_level + 1);
        run_kernel(
            exec,
            [] GKO_KERNEL(auto collated_i, auto block_level, auto block_min,
                          auto superblocks, auto num_blocks,
                          auto collation_width) {
                const auto block_size =
                    superblock_type::block_size_for_level(block_level);
                for (auto i = collated_i * collation_width;
                     i < std::min<int64>((collated_i + 1) * collation_width,
                                         num_blocks);
                     i++) {
                    // start of the second half of the range beginning at i
                    const auto i2 = i + block_size / 2;
                    const auto argmin1 =
                        i + superblocks.block_argmin(block_level - 1, i);
                    // fall back to the first half if the second half is
                    // entirely out of bounds
                    const auto argmin2 =
                        i2 < num_blocks
                            ? i2 + superblocks.block_argmin(block_level - 1, i2)
                            : argmin1;
                    const auto min1 = block_min[argmin1];
                    const auto min2 = block_min[argmin2];
                    // we need to use <= here to make sure
                    // ties always break to the left
                    superblocks.set_block_argmin(
                        block_level, i,
                        min1 <= min2 ? argmin1 - i : argmin2 - i);
                }
            },
            ceildiv(num_blocks, collation_width), block_level, block_min,
            superblocks, num_blocks, collation_width);
    }
#endif
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
    GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL);


} // namespace range_minimum_query
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
1 change: 1 addition & 0 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ target_sources(${ginkgo_core}
base/segmented_array.cpp
base/timer.cpp
base/version.cpp
components/range_minimum_query.cpp
config/config.cpp
config/config_helper.cpp
config/property_tree.cpp
Expand Down
9 changes: 8 additions & 1 deletion core/base/array.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "ginkgo/core/base/array.hpp"

#include <type_traits>

#include <ginkgo/core/base/math.hpp>

#include "core/base/array_access.hpp"
Expand Down Expand Up @@ -89,6 +91,11 @@ ValueType reduce_add(const array<ValueType>& input_arr,
#define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)

GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
template GKO_DECLARE_ARRAY_FILL(uint16);
template GKO_DECLARE_ARRAY_FILL(uint32);
#ifndef GKO_SIZE_T_IS_UINT64_T
template GKO_DECLARE_ARRAY_FILL(uint64);
#endif


#define GKO_DECLARE_ARRAY_REDUCE_ADD(_type) \
Expand Down
2 changes: 1 addition & 1 deletion core/base/batch_instantiation.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand Down
Loading

0 comments on commit efbf245

Please sign in to comment.