NVIDIA · sleeepyjack · Oct 2, 2024 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024
@@ -242,4 +242,11 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/hyperloglog/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/G4qdcTezE))
 - [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/hyperloglog/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/n88713o4n))
 
+### `bloom_filter`
+
+`cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries.
+
+#### Examples:
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EY7T5v5aE))
+
 
@@ -98,3 +98,9 @@ ConfigureBench(HASH_FUNCTION_BENCH
 # - hyperloglog benchmarks -----------------------------------------------------------
 ConfigureBench(HYPERLOGLOG_BENCH
   hyperloglog/hyperloglog_bench.cu)
+
+###################################################################################################
+# - bloom_filter benchmarks -----------------------------------------------------------------------
+# Both the add and the contains benchmark sources are handed to a single ConfigureBench call under
+# the name BLOOM_FILTER_BENCH.
+ConfigureBench(BLOOM_FILTER_BENCH
+  bloom_filter/add_bench.cu
+  bloom_filter/contains_bench.cu)
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <cuco/hash_functions.cuh>
+
 #include <nvbench/nvbench.cuh>
 
 #include <cstdint>
@@ -25,6 +27,12 @@ namespace cuco::benchmark::defaults {
 
 using KEY_TYPE_RANGE   = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
 using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
+// Hash functions offered as a type axis to benchmarks that compare hashers. Every entry is a
+// single-parameter hasher template instantiated with `char`.
+using HASH_RANGE       = nvbench::type_list<cuco::identity_hash<char>,
+                                            cuco::xxhash_32<char>,
+                                            cuco::xxhash_64<char>,
+                                            cuco::murmurhash3_32<char>>;
+// TODO handle tuple-like hash value, then extend the list above with:
+//   cuco::murmurhash3_x86_128<char>,
+//   cuco::murmurhash3_x64_128<char>
 
 auto constexpr N             = 100'000'000;
 auto constexpr OCCUPANCY     = 0.5;

@@ -39,6 +39,17 @@ auto dist_from_state(nvbench::state const& state)
   }
 }
 
+/**
+ * @brief Maps a hasher instantiated for one key type onto the same hasher template instantiated
+ * for another key type.
+ *
+ * Only the partial specialization below is defined, so the trait is usable solely for types of the
+ * form `Hasher<From>` where `Hasher` is matched as a template with one type parameter.
+ *
+ * @tparam HasherType Hasher instantiation to rebind
+ * @tparam To Key type the rebound hasher is instantiated with
+ */
+template <typename HasherType, typename To>
+struct rebind_hasher;
+
+/**
+ * @brief Decomposes `Hasher<From>` and exposes `Hasher<To>` as the member `type`.
+ */
+template <template <typename> class Hasher, typename From, typename To>
+struct rebind_hasher<Hasher<From>, To> {
+  typedef Hasher<To> type;  ///< `Hasher` re-instantiated with the new key type
+};
+
+/**
+ * @brief Convenience alias for `typename rebind_hasher<HasherType, To>::type`.
+ */
+template <typename HasherType, typename To>
+using rebind_hasher_t = typename rebind_hasher<HasherType, To>::type;
+
+
 }  // namespace cuco::benchmark
 
 NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::unique, "UNIQUE", "distribution::unique");

@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "defaults.hpp"
+#include "utils.hpp"
+
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
+
+#include <cuco/bloom_filter.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <cuda/std/limits>
+#include <thrust/device_vector.h>
+
+#include <cstdint>
+#include <exception>
+
+using namespace cuco::benchmark;  // defaults, dist_from_state, rebind_hasher_t, add_fpr_summary
+using namespace cuco::utility;    // key_generator, distribution
+
+/**
+ * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance
+ *
+ * Reads the `NumInputs`, `FilterSizeMB` and `PatternBits` axes from `state`, generates `NumInputs`
+ * keys according to `Dist`, and measures `add_async` over the full key range on the stream handed
+ * out by nvbench. Configurations whose policy cannot be constructed are skipped.
+ *
+ * @tparam Key Type of the keys added to the filter
+ * @tparam Hash Hasher instantiation; it is rebound to `Key` through `rebind_hasher_t`
+ * @tparam Word Word type passed to `cuco::bloom_filter_policy`
+ * @tparam WordsPerBlock Words-per-block value passed to `cuco::bloom_filter_policy`
+ * @tparam Dist Distribution tag forwarded to `dist_from_state`
+ *
+ * @param state nvbench state providing the axis values and receiving the measurement
+ */
+template <typename Key, typename Hash, typename Word, nvbench::int32_t WordsPerBlock, typename Dist>
+void bloom_filter_add(nvbench::state& state,
+                      nvbench::type_list<Key, Hash, Word, nvbench::enum_type<WordsPerBlock>, Dist>)
+{
+  using policy_type = cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>,
+                                                Word,
+                                                static_cast<std::uint32_t>(WordsPerBlock)>;
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;
+
+  auto const num_keys       = state.get_int64("NumInputs");
+  auto const filter_size_mb = state.get_int64("FilterSizeMB");
+  auto const pattern_bits   = static_cast<std::uint32_t>(state.get_int64("PatternBits"));
+
+  // Construct a throw-away policy first so that a rejected configuration is detected before any
+  // keys are allocated or generated.
+  try {
+    [[maybe_unused]] auto const policy = policy_type{pattern_bits};
+  } catch (std::exception const& e) {
+    state.skip(e.what());  // skip invalid configurations
+    // `skip` does not leave the function by itself; without this return the code below would go on
+    // to build the filter from the very same rejected `pattern_bits`.
+    return;
+  }
+
+  // Requested filter size in bytes divided by the size of one filter block in bytes
+  std::size_t const num_sub_filters =
+    (filter_size_mb * 1024 * 1024) /
+    (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(num_keys);
+
+  filter_type filter{num_sub_filters, {}, {pattern_bits}};
+
+  add_fpr_summary(state, filter);
+
+  // Only the insertion itself is inside the measured region
+  state.exec([&](nvbench::launch& launch) {
+    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+  });
+}
+
+// Filter-size sweep: `FilterSizeMB` takes every value in `defaults::BF_SIZE_MB_RANGE_CACHE`; all
+// other axes hold a single `defaults::BF_*` entry.
+NVBENCH_BENCH_TYPES(bloom_filter_add,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      nvbench::type_list<defaults::BF_HASH>,
+                                      nvbench::type_list<defaults::BF_WORD>,
+                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("bloom_filter_add_unique_size")
+  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE)
+  .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});
+
+// Hash-function sweep: the `Hash` type axis ranges over `defaults::HASH_RANGE`; filter size and
+// pattern bits are fixed to `defaults::BF_SIZE_MB` and `defaults::BF_PATTERN_BITS`.
+NVBENCH_BENCH_TYPES(bloom_filter_add,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      defaults::HASH_RANGE,
+                                      nvbench::type_list<defaults::BF_WORD>,
+                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("bloom_filter_add_unique_hash")
+  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
+  .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});
+
+// Block-layout sweep: `Word` is `uint32_t` or `uint64_t` and `WordsPerBlock` is 1, 2, 4 or 8;
+// `PatternBits` stays at `defaults::BF_PATTERN_BITS` for every combination.
+NVBENCH_BENCH_TYPES(bloom_filter_add,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      nvbench::type_list<defaults::BF_HASH>,
+                                      nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>,
+                                      nvbench::enum_type_list<1, 2, 4, 8>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("bloom_filter_add_unique_block_dim")
+  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
+  .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "defaults.hpp"
+#include "utils.hpp"
+
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
+
+#include <cuco/bloom_filter.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <cuda/std/limits>
+#include <thrust/device_vector.h>
+
+#include <exception>
+
+using namespace cuco::benchmark;  // defaults, dist_from_state, rebind_hasher_t, add_fpr_summary
+using namespace cuco::utility;    // key_generator, distribution
+
+/**
+ * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance
+ *
+ * Reads the `NumInputs`, `FilterSizeMB` and `PatternBits` axes from `state`, generates `NumInputs`
+ * keys according to `Dist`, adds all of them to the filter, and then measures `contains_async`
+ * for that same key range on the stream handed out by nvbench. Configurations whose policy cannot
+ * be constructed are skipped.
+ *
+ * @tparam Key Type of the keys added to and queried from the filter
+ * @tparam Hash Hasher instantiation; it is rebound to `Key` through `rebind_hasher_t`
+ * @tparam Word Word type passed to `cuco::bloom_filter_policy`
+ * @tparam WordsPerBlock Words-per-block value passed to `cuco::bloom_filter_policy`
+ * @tparam Dist Distribution tag forwarded to `dist_from_state`
+ *
+ * @param state nvbench state providing the axis values and receiving the measurement
+ */
+template <typename Key, typename Hash, typename Word, nvbench::int32_t WordsPerBlock, typename Dist>
+void bloom_filter_contains(
+  nvbench::state& state,
+  nvbench::type_list<Key, Hash, Word, nvbench::enum_type<WordsPerBlock>, Dist>)
+{
+  // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves performance if
+  // filter block fits into a 32B sector
+  using policy_type = cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>,
+                                                Word,
+                                                static_cast<std::uint32_t>(WordsPerBlock)>;
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;
+
+  auto const num_keys       = state.get_int64("NumInputs");
+  auto const filter_size_mb = state.get_int64("FilterSizeMB");
+  auto const pattern_bits   = static_cast<std::uint32_t>(state.get_int64("PatternBits"));
+
+  // Construct a throw-away policy first so that a rejected configuration is detected before any
+  // device buffers are allocated or keys are generated.
+  try {
+    [[maybe_unused]] auto const policy = policy_type{pattern_bits};
+  } catch (std::exception const& e) {
+    state.skip(e.what());  // skip invalid configurations
+    // `skip` does not leave the function by itself; without this return the code below would go on
+    // to build the filter from the very same rejected `pattern_bits`.
+    return;
+  }
+
+  // Requested filter size in bytes divided by the size of one filter block in bytes
+  std::size_t const num_sub_filters =
+    (filter_size_mb * 1024 * 1024) /
+    (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
+
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::device_vector<bool> result(num_keys, false);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(num_keys);
+
+  filter_type filter{num_sub_filters, {}, {pattern_bits}};
+
+  add_fpr_summary(state, filter);
+
+  // Populate the filter outside the measured region with the same keys that are queried below
+  filter.add(keys.begin(), keys.end());
+
+  state.exec([&](nvbench::launch& launch) {
+    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+  });
+}
+
+// Filter-size sweep: `FilterSizeMB` takes every value in `defaults::BF_SIZE_MB_RANGE_CACHE`; all
+// other axes hold a single `defaults::BF_*` entry.
+NVBENCH_BENCH_TYPES(bloom_filter_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      nvbench::type_list<defaults::BF_HASH>,
+                                      nvbench::type_list<defaults::BF_WORD>,
+                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("bloom_filter_contains_unique_size")
+  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE)
+  .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});
+
+// Hash-function sweep: the `Hash` type axis ranges over `defaults::HASH_RANGE`; filter size and
+// pattern bits are fixed to `defaults::BF_SIZE_MB` and `defaults::BF_PATTERN_BITS`.
+NVBENCH_BENCH_TYPES(bloom_filter_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      defaults::HASH_RANGE,
+                                      nvbench::type_list<defaults::BF_WORD>,
+                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("bloom_filter_contains_unique_hash")
+  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
+  .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});
+
+// Block-layout sweep: `Word` is `uint32_t` or `uint64_t` and `WordsPerBlock` is 1, 2, 4 or 8;
+// `PatternBits` stays at `defaults::BF_PATTERN_BITS` for every combination.
+NVBENCH_BENCH_TYPES(bloom_filter_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      nvbench::type_list<defaults::BF_HASH>,
+                                      nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>,
+                                      nvbench::enum_type_list<1, 2, 4, 8>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("bloom_filter_contains_unique_block_dim")
+  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
+  .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/hash_functions.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <cuda/std/array>
+
+#include <vector>
+
+// Default axis entries shared by the bloom filter benchmarks.
+namespace cuco::benchmark::defaults {
+
+// Single-entry defaults for the "Key", "Hash" and "Word" type axes
+using BF_KEY  = nvbench::int64_t;
+using BF_HASH = cuco::xxhash_64<char>;
+using BF_WORD = nvbench::uint32_t;
+
+// Single-entry defaults for the "NumInputs", "FilterSizeMB", "WordsPerBlock" and "PatternBits" axes
+static constexpr auto BF_N               = 400'000'000;
+static constexpr auto BF_SIZE_MB         = 2'000;
+static constexpr auto BF_WORDS_PER_BLOCK = 8;
+static constexpr auto BF_PATTERN_BITS    = BF_WORDS_PER_BLOCK;  // follows the words-per-block default
+
+// Powers of two from 1 to 2048, used as the "FilterSizeMB" values of the filter-size sweeps
+auto const BF_SIZE_MB_RANGE_CACHE =
+  std::vector<nvbench::int64_t>{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
+// Candidate values for a "PatternBits" sweep
+auto const BF_PATTERN_BITS_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 6, 8, 16};
+
+}  // namespace cuco::benchmark::defaults