From 6ec7f84e9a407aae378624937c37850a603ca161 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Sat, 28 Dec 2024 11:54:49 -0800 Subject: [PATCH] Adapt bitset word size (#1554) --- src/corecel/cont/Bitset.hh | 47 +++++++++++++++++++++++++--------- src/corecel/math/Algorithms.hh | 27 +++++++++++++++---- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/src/corecel/cont/Bitset.hh b/src/corecel/cont/Bitset.hh index 6925715802..249c7e6dd9 100644 --- a/src/corecel/cont/Bitset.hh +++ b/src/corecel/cont/Bitset.hh @@ -8,6 +8,8 @@ #pragma once #include +#include +#include #include "corecel/Config.hh" @@ -27,7 +29,7 @@ namespace celeritas * for our current use case. Given that GPU typically use 32-bit words, this * uses unsigned int as the word type instead of the unsigned long used by the * standard library. This container is not thread-safe, multiple threads are - * likely to manipulate the same word, even we accessing different indices. + * likely to manipulate the same word, even when accessing different indices. * * The following methods are not implemented: * - conversions to string, to_ulong, to_ullong @@ -44,7 +46,10 @@ class Bitset public: //!@{ //! \name Type aliases - using word_type = unsigned int; + using word_type = std::conditional_t< + (N <= 8), + std::uint8_t, + std::conditional_t<(N <= 16), std::uint16_t, size_type>>; //!@} class reference; @@ -53,7 +58,7 @@ class Bitset //// CONSTRUCTORS //// // Default construct with zeros for all bits - CELER_CONSTEXPR_FUNCTION Bitset() = default; + constexpr Bitset() = default; // Construct implicitly from a bitset encoded as an integer CELER_CONSTEXPR_FUNCTION Bitset(word_type value) noexcept; @@ -149,6 +154,9 @@ class Bitset // Create a mask for a given bit index static CELER_CONSTEXPR_FUNCTION word_type mask(size_type pos) noexcept; + // Create a negative mask for a given bit index + static CELER_CONSTEXPR_FUNCTION word_type neg_mask(size_type pos) noexcept; + // Get the word for a given bit position CELER_CONSTEXPR_FUNCTION word_type get_word(size_type pos) const noexcept(!CELERITAS_DEBUG); @@ -185,9 +193,9 @@ class Bitset::reference { } - CELER_CONSTEXPR_FUNCTION reference(reference const&) = default; + constexpr reference(reference const&) = default; - CELER_FUNCTION ~reference() noexcept = default; + ~reference() noexcept = default; //! Assignment for b[i] = x; CELER_CONSTEXPR_FUNCTION @@ -199,7 +207,7 @@ class Bitset::reference } else { - *word_pointer_ &= ~Bitset::mask(bit_pos_); + *word_pointer_ &= Bitset::neg_mask(bit_pos_); } return *this; } @@ -216,7 +224,7 @@ class Bitset::reference } else { - *word_pointer_ &= ~Bitset::mask(bit_pos_); + *word_pointer_ &= Bitset::neg_mask(bit_pos_); } } return *this; @@ -313,7 +321,7 @@ CELER_CONSTEXPR_FUNCTION bool Bitset::all() const noexcept { for (size_type i = 0; i < num_words_ - 1; ++i) { - if (words_[i] != ~word_type(0)) + if (words_[i] != static_cast(~word_type(0))) { return false; } @@ -321,7 +329,8 @@ CELER_CONSTEXPR_FUNCTION bool Bitset::all() const noexcept // Only compare the last word up to the last bit of the bitset return this->last_word() - == (~word_type(0) >> (num_words_ * bits_per_word_ - N)); + == (static_cast(~word_type(0)) + >> (num_words_ * bits_per_word_ - N)); } //---------------------------------------------------------------------------// @@ -416,7 +425,7 @@ CELER_CONSTEXPR_FUNCTION Bitset& Bitset::set() noexcept { for (size_type i = 0; i < num_words_; ++i) { - words_[i] = ~word_type(0); + words_[i] = static_cast(~word_type(0)); } // Clear unused bits on the last word @@ -456,7 +465,7 @@ CELER_CONSTEXPR_FUNCTION Bitset& Bitset::reset(size_type pos) noexcept(!CELERITAS_DEBUG) { CELER_EXPECT(pos < N); - this->get_word(pos) &= ~Bitset::mask(pos); + this->get_word(pos) &= Bitset::neg_mask(pos); return *this; } @@ -512,6 +521,19 @@ Bitset::mask(size_type pos) noexcept -> word_type return word_type(1) << Bitset::which_bit(pos); } +//---------------------------------------------------------------------------// +/*! + * Create a negative mask (a single 0 bit) for a given bit index. The purpose + * of this function is to cast a potentially promoted word_type (from ~) back + * to the original word_type. + */ +template +CELER_CONSTEXPR_FUNCTION auto +Bitset::neg_mask(size_type pos) noexcept -> word_type +{ + return ~(word_type(1) << Bitset::which_bit(pos)); +} + //---------------------------------------------------------------------------// //! Get the word for a given bit position template @@ -556,7 +578,8 @@ CELER_CONSTEXPR_FUNCTION void Bitset::sanitize() noexcept constexpr size_type extra_bits = N % bits_per_word_; if constexpr (extra_bits != 0) { - this->last_word() &= ~((~word_type(0)) << extra_bits); + this->last_word() &= static_cast( + ~(static_cast(~word_type(0)) << extra_bits)); } } diff --git a/src/corecel/math/Algorithms.hh b/src/corecel/math/Algorithms.hh index 64dbcd53de..1b4be1a9c9 100644 --- a/src/corecel/math/Algorithms.hh +++ b/src/corecel/math/Algorithms.hh @@ -738,18 +738,35 @@ CELER_FORCEINLINE_FUNCTION void sincospi(double a, double* s, double* c) /*! * Count the number of set bits in an integer. */ +template #if defined(_MSC_VER) -inline int popcount(unsigned int x) noexcept +inline int popcount(T x) noexcept #else -inline constexpr int popcount(unsigned int x) noexcept +inline constexpr int popcount(T x) noexcept #endif { + static_assert(sizeof(T) <= 8, + "popcount is only defined for 32-bit and 64-bit integers"); + static_assert(std::is_integral_v && std::is_unsigned_v, + "popcount is only defined for unsigned integral types"); + + if constexpr (sizeof(T) <= 4) + { +#if CELER_DEVICE_COMPILE + return __popc(x); +#elif defined(_MSC_VER) + return __popcnt(x); +#else + return __builtin_popcount(x); +#endif + } + #if CELER_DEVICE_COMPILE - return __popc(x); + return __popcll(x); #elif defined(_MSC_VER) - return __popcnt(x); + return __popcnt64(x); #else - return __builtin_popcount(x); + return __builtin_popcountl(x); #endif }