Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuco::bloom_filter #101

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
2e68593
Added bloom_filter with example and benchmarks.
sleeepyjack Aug 9, 2021
f8a4722
Add function to (re-)initialize the filter.
sleeepyjack Aug 18, 2021
ce83891
Benchmarks refactored. Added benchmark for L2 resident filter.
sleeepyjack Aug 18, 2021
4899219
Unit tests for bloom filter added.
sleeepyjack Aug 18, 2021
5301bb2
Add missing const specifier for device-side contains operation.
sleeepyjack Aug 18, 2021
aadfaab
Fix num_bits and num_slots calculation.
sleeepyjack Aug 18, 2021
2ca505f
Fix for key pattern computation. Reduces FPR by a factor of ~10.
sleeepyjack Aug 19, 2021
0634ef7
Benchmark analysis notebook added.
sleeepyjack Aug 24, 2021
7bc30c9
Added helper functions for L2 residency control.
sleeepyjack Aug 25, 2021
8faf891
Add function for determining optimal grid size.
sleeepyjack Sep 5, 2021
370f11c
Extended bloom filter benchmarks.
sleeepyjack Sep 5, 2021
2f1fd6b
Merge remote-tracking branch 'upstream/dev' into bloom-filter
sleeepyjack Jul 13, 2022
aea31da
Remove output write buffer.
sleeepyjack Jul 13, 2022
a1c4138
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jul 13, 2022
a0b8684
Add missing doxygen docs.
sleeepyjack Jul 13, 2022
61e06de
Move tests/util.hpp -> tests/utils.hpp.
sleeepyjack Jul 13, 2022
c101801
Update copyright headers.
sleeepyjack Jul 13, 2022
42ced35
Specify kernel launch bounds.
sleeepyjack Jul 18, 2022
e31a050
Reorder and add missing includes.
sleeepyjack Jul 18, 2022
0f32782
Generate key patterns using extended double hashing.
sleeepyjack Jul 19, 2022
bc50bec
Merge branch 'feature/config_file' into bloom-filter
sleeepyjack Jul 21, 2022
748ee64
Move CUCO_HAS_CUDA_ANNOTATED_PTR to detail/__config.
sleeepyjack Jul 21, 2022
f4fbfa4
New Bloom filter benchmarks.
sleeepyjack Jul 28, 2022
a1f86f1
Remove external definition of NVBENCH_MODULE.
sleeepyjack Jul 29, 2022
faa5e85
Remove outdated L2 residency control helper.
sleeepyjack Jul 29, 2022
5578405
Merge branch 'dev' into bloom-filter
sleeepyjack Jul 29, 2022
602be1b
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jul 29, 2022
4f9acd7
Replace grid.num_threads() with grid.size(). (Backport for CUDA 11.0)
sleeepyjack Jul 29, 2022
efc2998
More descriptive names for examples.
sleeepyjack Aug 1, 2022
51189e4
Rename Bloom filter example to comply to the new naming scheme for ex…
sleeepyjack Aug 1, 2022
75a127e
Add stream param to Bloom filter ctor.
sleeepyjack Aug 1, 2022
2e5d6c3
Use new CUCO_HAS_INDEPENDENT_THREADS macro.
sleeepyjack Aug 1, 2022
a1ea293
Add helper functions for L2 residency control.
sleeepyjack Aug 1, 2022
6b61279
Add usage example for an L2-resident Bloom filter.
sleeepyjack Aug 1, 2022
d3f825f
Use L2 residency control helper functions in benchmark script.
sleeepyjack Aug 1, 2022
5c726e7
Merge remote-tracking branch 'upstream/dev' into bloom-filter
sleeepyjack Aug 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,7 @@ ConfigureNVBench(RETRIEVE_BENCH
# - reduce_by_key benchmarks ----------------------------------------------------------------------
set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")

###################################################################################################
ConfigureNVBench(BLOOM_FILTER_BENCH
"bloom_filter/bloom_filter_bench.cu")
261 changes: 261 additions & 0 deletions benchmarks/analysis/notebooks/bloom_filter_bench.ipynb

Large diffs are not rendered by default.

307 changes: 307 additions & 0 deletions benchmarks/bloom_filter/bloom_filter_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/bloom_filter.cuh>
#include <cuco/detail/cache_residency_control.cuh>

#include <nvbench/nvbench.cuh>

#include <cuda/std/atomic>

#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cstddef>

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

static constexpr nvbench::int64_t block_size = 256;
static constexpr nvbench::int64_t stride = 4;

enum class FilterOperation { INSERT, CONTAINS };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
FilterOperation,
[](FilterOperation op) {
switch (op) {
case FilterOperation::INSERT: return "INSERT";
case FilterOperation::CONTAINS: return "CONTAINS";
default: return "ERROR";
}
},
[](FilterOperation op) {
switch (op) {
case FilterOperation::INSERT: return "FilterOperation::INSERT";
case FilterOperation::CONTAINS: return "FilterOperation::CONTAINS";
default: return "ERROR";
}
})

enum class FilterScope { GMEM, L2 };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
FilterScope,
[](FilterScope s) {
switch (s) {
case FilterScope::GMEM: return "GMEM";
case FilterScope::L2: return "L2";
default: return "ERROR";
}
},
[](FilterScope s) {
switch (s) {
case FilterScope::GMEM: return "FilterScope::GMEM";
case FilterScope::L2: return "FilterScope::L2";
default: return "ERROR";
}
})

enum class DataScope { GMEM, REGS };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
DataScope,
[](DataScope s) {
switch (s) {
case DataScope::GMEM: return "GMEM";
case DataScope::REGS: return "REGS";
default: return "ERROR";
}
},
[](DataScope s) {
switch (s) {
case DataScope::GMEM: return "DataScope::GMEM";
case DataScope::REGS: return "DataScope::REGS";
default: return "ERROR";
}
})

template <typename Key, typename Slot>
void add_size_summary(nvbench::state& state)
{
using filter_type =
cuco::bloom_filter<Key, cuda::thread_scope_device, cuco::cuda_allocator<char>, Slot>;

auto const num_keys = state.get_int64("NumInputs");
auto const num_bits = state.get_int64("NumBits");
auto const num_hashes = state.get_int64("NumHashes");

filter_type filter(num_bits, num_hashes);

auto& summ = state.add_summary("nv/filter/size/mb");
summ.set_string("hint", "FilterMB");
summ.set_string("short_name", "FilterMB");
summ.set_string("description", "Size of the Bloom filter in MB.");
summ.set_float64("value", filter.get_num_slots() * sizeof(Slot) / 1000 / 1000);
}

template <typename Key, typename Slot>
void add_fpr_summary(nvbench::state& state)
{
using filter_type =
cuco::bloom_filter<Key, cuda::thread_scope_device, cuco::cuda_allocator<char>, Slot>;

auto const num_keys = state.get_int64("NumInputs");
auto const num_bits = state.get_int64("NumBits");
auto const num_hashes = state.get_int64("NumHashes");

thrust::device_vector<Key> keys(num_keys * 2);
thrust::sequence(thrust::device, keys.begin(), keys.end(), 1);
thrust::device_vector<bool> result(num_keys, false);

auto tp_begin = keys.begin();
auto tp_end = tp_begin + num_keys;
auto tn_begin = tp_end;
auto tn_end = keys.end();

filter_type filter(num_bits, num_hashes);
filter.insert(tp_begin, tp_end);
filter.contains(tn_begin, tn_end, result.begin());

float fp = thrust::count(thrust::device, result.begin(), result.end(), true);

auto& summ = state.add_summary("nv/filter/fpr");
summ.set_string("hint", "FPR");
summ.set_string("short_name", "FPR");
summ.set_string("description", "False-positive rate of the bloom filter.");
summ.set_float64("value", fp / num_keys);
}

template <nvbench::int64_t BLOCK_SIZE, typename Filter, typename InputIt>
__global__ void __launch_bounds__(BLOCK_SIZE)
insert_kernel(Filter mutable_view, InputIt first, InputIt last)
{
std::size_t tid = block_size * blockIdx.x + threadIdx.x;
auto it = first + tid;

while (it < last) {
mutable_view.insert(*it);
it += gridDim.x * BLOCK_SIZE;
}
}

template <nvbench::int64_t BLOCK_SIZE, typename Filter, typename InputIt, typename OutputIt>
__global__ void __launch_bounds__(BLOCK_SIZE)
contains_kernel(Filter view, InputIt first, InputIt last, OutputIt results)
{
std::size_t tid = block_size * blockIdx.x + threadIdx.x;

while ((first + tid) < last) {
*(results + tid) = view.contains(*(first + tid));
tid += gridDim.x * BLOCK_SIZE;
}
}

template <nvbench::int64_t BLOCK_SIZE, typename Filter>
__global__ void __launch_bounds__(BLOCK_SIZE)
insert_kernel(Filter mutable_view, nvbench::int64_t num_keys)
{
using key_type = typename Filter::key_type;

auto g = cg::this_grid();

for (key_type key = g.thread_rank(); key < num_keys; key += g.size()) {
mutable_view.insert(key);
}
}

template <nvbench::int64_t BLOCK_SIZE, typename Filter>
__global__ void __launch_bounds__(BLOCK_SIZE)
contains_kernel(Filter view, nvbench::int64_t num_keys)
{
using key_type = typename Filter::key_type;

auto g = cg::this_grid();

for (key_type key = g.thread_rank(); key < num_keys; key += g.size()) {
volatile bool contains = view.contains(key);
}
}

template <typename Key, typename Slot, FilterOperation Op, FilterScope FScope, DataScope DScope>
void nvbench_cuco_bloom_filter(nvbench::state& state,
nvbench::type_list<Key,
Slot,
nvbench::enum_type<Op>,
nvbench::enum_type<FScope>,
nvbench::enum_type<DScope>>)
{
auto num_keys = state.get_int64("NumInputs");
auto num_bits = state.get_int64("NumBits");
auto num_hashes = state.get_int64("NumHashes");

[[maybe_unused]] thrust::device_vector<Key> keys;
[[maybe_unused]] thrust::device_vector<bool> results;

if constexpr (DScope == DataScope::GMEM) {
keys.resize(num_keys);
thrust::sequence(thrust::device, keys.begin(), keys.end(), 1);

if constexpr (Op == FilterOperation::CONTAINS) { results.resize(num_keys); }
}

using filter_type =
cuco::bloom_filter<Key, cuda::thread_scope_device, cuco::cuda_allocator<char>, Slot>;

filter_type filter(num_bits, num_hashes);
auto mutable_view = filter.get_device_mutable_view();
auto view = filter.get_device_view();
std::size_t const grid_size = SDIV(num_keys, stride * block_size);

state.add_element_count(num_keys);
state.add_global_memory_writes<Slot>(num_keys);

add_fpr_summary<Key, Slot>(state);
add_size_summary<Key, Slot>(state);

if constexpr (Op == FilterOperation::CONTAINS) {
insert_kernel<block_size><<<grid_size, block_size>>>(mutable_view, num_keys);
}

cudaStream_t stream;
cudaStreamCreate(&stream);

if constexpr (FScope == FilterScope::L2)
cuco::register_l2_persistence(
stream, filter.get_slots(), filter.get_slots() + filter.get_num_slots());

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));

state.exec([&](nvbench::launch& launch) {
if constexpr (Op == FilterOperation::INSERT) {
filter.initialize(launch.get_stream());
if constexpr (DScope == DataScope::GMEM) {
insert_kernel<block_size><<<grid_size, block_size, 0, launch.get_stream()>>>(
mutable_view, keys.begin(), keys.end());
}
if constexpr (DScope == DataScope::REGS) {
insert_kernel<block_size>
<<<grid_size, block_size, 0, launch.get_stream()>>>(mutable_view, num_keys);
}
}
if constexpr (Op == FilterOperation::CONTAINS) {
if constexpr (DScope == DataScope::GMEM) {
contains_kernel<block_size><<<grid_size, block_size, 0, launch.get_stream()>>>(
view, keys.begin(), keys.end(), results.begin());
}
if constexpr (DScope == DataScope::REGS) {
contains_kernel<block_size>
<<<grid_size, block_size, 0, launch.get_stream()>>>(view, num_keys);
}
}
});

if constexpr (FScope == FilterScope::L2) cuco::unregister_l2_persistence(stream);
}

using key_type_range = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
using slot_type_range = nvbench::type_list<nvbench::int32_t, nvbench::uint64_t>;
using op_range = nvbench::enum_type_list<FilterOperation::INSERT, FilterOperation::CONTAINS>;
using filter_scope_range = nvbench::enum_type_list<FilterScope::GMEM, FilterScope::L2>;
using data_scope_range = nvbench::enum_type_list<DataScope::GMEM, DataScope::REGS>;

// A100 L2 = 40MB ~ 330'000'000 bits
// smem = 48kb ~ 390'0000 bits
// 1GB ~ 8'500'000'000 bits
// 4GB ~ 34'000'000'000 bits

NVBENCH_BENCH_TYPES(nvbench_cuco_bloom_filter,
NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
nvbench::type_list<nvbench::int64_t>,
op_range,
filter_scope_range,
data_scope_range))
.set_name("cuco_bloom_filter_l2")
.set_type_axes_names({"KeyType", "SlotType", "FilterOperation", "FilterScope", "DataScope"})
.set_max_noise(3)
.add_int64_axis("NumInputs", {10'000'000, 100'000'000})
.add_int64_axis("NumBits", {300'000'000})
.add_int64_axis("NumHashes", {2});

NVBENCH_BENCH_TYPES(nvbench_cuco_bloom_filter,
NVBENCH_TYPE_AXES(key_type_range,
slot_type_range,
op_range,
nvbench::enum_type_list<FilterScope::GMEM>,
data_scope_range))
.set_name("cuco_bloom_filter_gmem")
.set_type_axes_names({"KeyType", "SlotType", "FilterOperation", "FilterScope", "DataScope"})
.set_max_noise(3)
.add_int64_axis("NumInputs", {1'000'000'000, 100'000'000})
.add_int64_axis("NumBits", {8'500'000'000, 34'000'000'000})
.add_int64_axis("NumHashes", {6});
2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/stati
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu")
ConfigureExample(BLOOM_FILTER_L2_RESIDENCY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/l2_residency_example.cu")

foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
if("${arch}" MATCHES "^6")
Expand Down
68 changes: 68 additions & 0 deletions examples/bloom_filter/host_bulk_example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/bloom_filter.cuh>

#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>

#include <iostream>

int main(void)
{
// Generate 10'000 keys and insert the first 5'000 into the filter.
int const num_keys = 10'000;
int const num_tp = num_keys * 0.5;
int const num_tn = num_keys - num_tp;

// Spawn a filter with 1'000'000 bits and 6-bit patterns for each key.
cuco::bloom_filter<int> filter{num_tp * 10, 6};

thrust::device_vector<int> keys(num_keys);
thrust::sequence(keys.begin(), keys.end(), 1);

auto tp_begin = keys.begin();
auto tp_end = tp_begin + num_tp;
auto tn_begin = tp_end;
auto tn_end = keys.end();

// Insert the first half of the keys.
filter.insert(tp_begin, tp_end);

thrust::device_vector<bool> tp_result(num_tp, false);
thrust::device_vector<bool> tn_result(num_keys - num_tp, false);

// Query the filter for the previously inserted keys.
// This should result in a true-positive rate of TPR=1.
filter.contains(tp_begin, tp_end, tp_result.begin());

// Query the filter for the keys that are not present in the filter.
// Since bloom filters are probalistic data structures, the filter
// exhibits a false-positive rate FPR>0 depending on the number of bits in
// the filter and the number of hashes used per key.
filter.contains(tn_begin, tn_end, tn_result.begin());

float tp_rate =
float(thrust::count(thrust::device, tp_result.begin(), tp_result.end(), true)) / float(num_tp);
float fp_rate =
float(thrust::count(thrust::device, tn_result.begin(), tn_result.end(), true)) / float(num_tn);

std::cout << "TPR=" << tp_rate << " FPR=" << fp_rate << std::endl;

return 0;
}
Loading