From 7609d18cc7423558c03fc7e2dfcf6df487725c36 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Thu, 30 Jan 2025 17:54:55 -0500 Subject: [PATCH 1/4] Fix indexing bug when using parallelism to build CPU hierarchy in HNSW (#620) hnswlib uses an internal indexing system which assigns an ID to points, atomically, in-order that they are added to the index. When using parallelism to add points to the index, the internal ID may be different than the "label" of the point (label, for us, is just the index of the row in the dataset) as a consequence of adding points in-parallel in no deterministic order. The bug was that I was using the label itself to write out the CPU hierarchy, when I should have been using hnswlib's internal ID for the point associated with that label. Authors: - Divye Gala (https://github.com/divyegala) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/620 --- cpp/include/cuvs/neighbors/hnsw.h | 22 ++--- cpp/include/cuvs/neighbors/hnsw.hpp | 6 +- cpp/src/neighbors/detail/hnsw.hpp | 84 ++++--------------- cpp/src/neighbors/hnsw_c.cpp | 2 +- python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx | 10 +-- python/cuvs/cuvs/tests/test_hnsw.py | 5 +- .../config/algos/cuvs_cagra_hnswlib.yaml | 2 +- 7 files changed, 38 insertions(+), 93 deletions(-) diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h index a7597a939..d88fd3b4e 100644 --- a/cpp/include/cuvs/neighbors/hnsw.h +++ b/cpp/include/cuvs/neighbors/hnsw.h @@ -51,9 +51,9 @@ struct cuvsHnswIndexParams { /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ int ef_construction; /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` - NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive - to parallelism, and increasing the number of threads can reduce the quality of the index. 
- */ + When the value is 0, the number of threads is automatically determined to the maximum + number of threads available. + */ int num_threads; }; @@ -158,8 +158,8 @@ cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params); * NOTE: When hierarchy is: * 1. `NONE`: This method uses the filesystem to write the CAGRA index in * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary - * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as - * the format is not compatible with the original hnswlib. + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, + * as the format is not compatible with the original hnswlib. * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The * serialized index is also compatible with the original hnswlib library. * @@ -364,10 +364,10 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res, /** * @brief Serialize a CAGRA index to a file as an hnswlib index - * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the - * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. - * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib - * library. + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by + * the hnswlib wrapper in cuVS, as the serialization format is not compatible with the original + * hnswlib. However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the + * original hnswlib library. * * @param[in] res cuvsResources_t opaque C handle * @param[in] filename the name of the file to save the index @@ -406,8 +406,8 @@ cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHns /** * Load hnswlib index from file which was serialized from a HNSW index. 
* NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable, and only be read by the - * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. - * Experimental, both the API and the serialization format are subject to change. + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original + * hnswlib. Experimental, both the API and the serialization format are subject to change. * * @code{.c} * #include diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index 81a823493..750f1f87f 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -54,10 +54,10 @@ struct index_params : cuvs::neighbors::index_params { /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ int ef_construction = 200; /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` - NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive - to parallelism, and increasing the number of threads can reduce the quality of the index. + When the value is 0, the number of threads is automatically determined to the + maximum number of threads available. 
*/ - int num_threads = 2; + int num_threads = 0; }; /**@}*/ diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 5447ae07a..07e012349 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -21,65 +21,13 @@ #include #include #include +#include #include #include #include namespace cuvs::neighbors::hnsw::detail { -// Multithreaded executor -// The helper function is copied from the hnswlib repository -// as for some reason, adding vectors to the hnswlib index does not -// work well with omp parallel for -template -inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) -{ - if (numThreads <= 0) { numThreads = std::thread::hardware_concurrency(); } - - if (numThreads == 1) { - for (size_t id = start; id < end; id++) { - fn(id, 0); - } - } else { - std::vector threads; - std::atomic current(start); - - // keep track of exceptions in threads - // https://stackoverflow.com/a/32428427/1713196 - std::exception_ptr lastException = nullptr; - std::mutex lastExceptMutex; - - for (size_t threadId = 0; threadId < numThreads; ++threadId) { - threads.push_back(std::thread([&, threadId] { - while (true) { - size_t id = current.fetch_add(1); - - if (id >= end) { break; } - - try { - fn(id, threadId); - } catch (...) { - std::unique_lock lastExcepLock(lastExceptMutex); - lastException = std::current_exception(); - /* - * This will work even when current is the largest value that - * size_t can fit, because fetch_add returns the previous value - * before the increment (what will result in overflow - * and produce 0 instead of current + 1). 
- */ - current = end; - break; - } - } - })); - } - for (auto& thread : threads) { - thread.join(); - } - if (lastException) { std::rethrow_exception(lastException); } - } -} - template struct hnsw_dist_t { using type = void; @@ -223,10 +171,12 @@ std::enable_if_t>> fro cagra_index.graph().extent(1) / 2, params.ef_construction); appr_algo->base_layer_init = false; // tell hnswlib to build upper layers only - ParallelFor(0, host_dataset_view.extent(0), params.num_threads, [&](size_t i, size_t threadId) { + auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads; +#pragma omp parallel for num_threads(num_threads) + for (int64_t i = 0; i < host_dataset_view.extent(0); i++) { appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)), i); - }); + } appr_algo->base_layer_init = true; // reset to true to allow addition of new points // move cagra graph to host @@ -242,11 +192,13 @@ std::enable_if_t>> fro // copy cagra graph to hnswlib base layer #pragma omp parallel for for (size_t i = 0; i < static_cast(host_graph.extent(0)); ++i) { - auto ll_i = appr_algo->get_linklist0(i); + auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second; + auto ll_i = appr_algo->get_linklist0(hnsw_internal_id); appr_algo->setListCount(ll_i, host_graph.extent(1)); auto* data = (uint32_t*)(ll_i + 1); for (size_t j = 0; j < static_cast(host_graph.extent(1)); ++j) { - data[j] = host_graph(i, j); + auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second; + data[j] = neighbor_internal_id; } } @@ -281,19 +233,15 @@ void extend(raft::resources const& res, const_cast(idx.get_index())); auto current_element_count = hnswlib_index->getCurrentElementCount(); auto new_element_count = additional_dataset.extent(0); - auto num_threads = params.num_threads == 0 ? std::thread::hardware_concurrency() - : static_cast(params.num_threads); + auto num_threads = params.num_threads == 0 ? 
omp_get_max_threads() : params.num_threads; hnswlib_index->resizeIndex(current_element_count + new_element_count); - ParallelFor(current_element_count, - current_element_count + new_element_count, - num_threads, - [&](size_t i, size_t threadId) { - hnswlib_index->addPoint( - (void*)(additional_dataset.data_handle() + - (i - current_element_count) * additional_dataset.extent(1)), - i); - }); +#pragma omp parallel for num_threads(num_threads) + for (int64_t i = 0; i < additional_dataset.extent(0); i++) { + hnswlib_index->addPoint( + (void*)(additional_dataset.data_handle() + i * additional_dataset.extent(1)), + current_element_count + i); + } } template diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp index 0233a510a..628d87e00 100644 --- a/cpp/src/neighbors/hnsw_c.cpp +++ b/cpp/src/neighbors/hnsw_c.cpp @@ -123,7 +123,7 @@ extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) { return cuvs::core::translate_exceptions([=] { *params = new cuvsHnswIndexParams{ - .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 2}; + .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 0}; }); } diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index 4c44350e8..72a3617bd 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -52,12 +52,10 @@ cdef class IndexParams: ef_construction : int, default = 200 (optional) Maximum number of candidate list size used during construction when hierarchy is `cpu`. - num_threads : int, default = 2 (optional) + num_threads : int, default = 0 (optional) Number of CPU threads used to increase construction parallelism - when hierarchy is `cpu`. - NOTE: Constructing the hierarchy when converting from a CAGRA graph - is highly sensitive to parallelism, and increasing the number of - threads can reduce the quality of the index. + when hierarchy is `cpu`. 
When the value is 0, the number of threads is + automatically determined to the maximum number of threads available. """ cdef cuvsHnswIndexParams* params @@ -71,7 +69,7 @@ cdef class IndexParams: def __init__(self, *, hierarchy="none", ef_construction=200, - num_threads=2): + num_threads=0): if hierarchy == "none": self.params.hierarchy = cuvsHnswHierarchy.NONE elif hierarchy == "cpu": diff --git a/python/cuvs/cuvs/tests/test_hnsw.py b/python/cuvs/cuvs/tests/test_hnsw.py index e00b88bb7..23a0920ef 100644 --- a/python/cuvs/cuvs/tests/test_hnsw.py +++ b/python/cuvs/cuvs/tests/test_hnsw.py @@ -54,7 +54,7 @@ def run_hnsw_build_search_test( assert index.trained - hnsw_params = hnsw.IndexParams(hierarchy=hierarchy, num_threads=1) + hnsw_params = hnsw.IndexParams(hierarchy=hierarchy) hnsw_index = hnsw.from_cagra(hnsw_params, index) queries = generate_data((n_queries, n_cols), dtype) @@ -135,7 +135,7 @@ def run_hnsw_extend_test( assert index.trained - hnsw_params = hnsw.IndexParams(hierarchy="cpu", num_threads=1) + hnsw_params = hnsw.IndexParams(hierarchy="cpu") hnsw_index = hnsw.from_cagra(hnsw_params, index) hnsw.extend(hnsw.ExtendParams(), hnsw_index, add_dataset) @@ -158,7 +158,6 @@ def run_hnsw_extend_test( skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True) recall = calc_recall(out_idx, skl_idx) - print(recall) assert recall > 0.95 diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml index 90a561bca..630dc94ff 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml @@ -1,5 +1,6 @@ name: cuvs_cagra_hnswlib constraints: + build: cuvs_bench.config.algos.constraints.cuvs_cagra_build search: cuvs_bench.config.algos.constraints.hnswlib_search groups: base: @@ -9,6 +10,5 @@ groups: graph_build_algo: ["NN_DESCENT"] hierarchy: ["none", "cpu"] ef_construction: [64, 128, 256, 
512] - num_threads: [2, 5, 10] search: ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800] From 833f28c4d630d8c9057b1179440909501dab8022 Mon Sep 17 00:00:00 2001 From: Ben Karsin Date: Thu, 30 Jan 2025 12:56:27 -1000 Subject: [PATCH 2/4] Vamana build improvement and added docs (#558) Includes several fixes and improvements to Vamana, primarily: - Edge case and bug fixes for Vamana index build (details below) - Documentation added for Vamana - experimental namespace removed - Reduce device memory usage by splitting reverse edge work into batches The edge case fix adds padding to all shared memory size and offset calculations so any dataset dimension is supported (tests added that verify this). A bug was also fixed with the L2 distance metric causing incorrect results in some rare cases. This PR addresses the most pressing items in #393 and stabilize the index construction sufficiently to remove the experimental namespace. Authors: - Ben Karsin (https://github.com/bkarsin) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/cuvs/pull/558 --- cpp/include/cuvs/neighbors/vamana.hpp | 299 ++++++++++++++++-- .../neighbors/detail/vamana/greedy_search.cuh | 8 +- cpp/src/neighbors/detail/vamana/macros.cuh | 4 +- .../detail/vamana/priority_queue.cuh | 4 +- .../neighbors/detail/vamana/robust_prune.cuh | 14 +- .../neighbors/detail/vamana/vamana_build.cuh | 197 +++++++----- .../detail/vamana/vamana_serialize.cuh | 4 +- .../detail/vamana/vamana_structs.cuh | 25 +- cpp/src/neighbors/vamana.cuh | 9 +- cpp/src/neighbors/vamana_build_float.cu | 34 +- cpp/src/neighbors/vamana_build_int8.cu | 34 +- cpp/src/neighbors/vamana_build_uint8.cu | 34 +- cpp/src/neighbors/vamana_serialize.cuh | 9 +- cpp/src/neighbors/vamana_serialize_float.cu | 4 +- cpp/src/neighbors/vamana_serialize_int8.cu | 4 +- cpp/src/neighbors/vamana_serialize_uint8.cu | 4 +- cpp/tests/neighbors/ann_vamana.cuh | 116 +++---- 
.../ann_vamana/test_float_uint32_t.cu | 4 +- .../ann_vamana/test_int8_t_uint32_t.cu | 4 +- .../ann_vamana/test_uint8_t_uint32_t.cu | 4 +- docs/source/cpp_api/neighbors.rst | 1 + docs/source/cpp_api/neighbors_vamana.rst | 44 +++ docs/source/indexes/indexes.rst | 1 + docs/source/indexes/vamana.rst | 75 +++++ examples/cpp/src/vamana_example.cu | 60 ++-- 25 files changed, 703 insertions(+), 293 deletions(-) create mode 100644 docs/source/cpp_api/neighbors_vamana.rst create mode 100644 docs/source/indexes/vamana.rst diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index bec17937f..bc205a6f4 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -31,16 +31,27 @@ #include #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { + /** * @defgroup vamana_cpp_index_params Vamana index build parameters * @{ */ /** - * @brief ANN parameters used by VAMANA to build index + * @brief Parameters used to build DiskANN index * + * `graph_degree`: Maximum degree of graph; correspods to the R parameter of + * Vamana algorithm in the literature. + * `visited_size`: Maximum number of visited nodes per search during Vamana algorithm. + * Loosely corresponds to the L parameter in the literature. + * `vamana_iters`: The number of times all vectors are inserted into the graph. If > 1, + * all vectors are re-inserted to improve graph quality. + * `max_fraction`: The maximum batch size is this fraction of the total dataset size. Larger + * gives faster build but lower graph quality. + * `alpha`: Used to determine how aggressive the pruning will be. */ + struct index_params : cuvs::neighbors::index_params { /** Maximum degree of output graph corresponds to the R parameter in the original Vamana * literature. */ @@ -55,10 +66,12 @@ struct index_params : cuvs::neighbors::index_params { /** Maximum fraction of dataset inserted per batch. 
* * Larger max batch decreases graph quality, but improves speed */ float max_fraction = 0.06; - /** Base of growth rate of batch sies **/ + /** Base of growth rate of batch sizes **/ float batch_base = 2; /** Size of candidate queue structure - should be (2^x)-1 */ uint32_t queue_size = 127; + /** Max batchsize of reverse edge processing (reduces memory footprint) */ + uint32_t reverse_batchsize = 1000000; }; /** @@ -215,61 +228,295 @@ struct index : cuvs::neighbors::index { * @{ */ /** - * @brief Build the index from the dataset for efficient search. + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. 
+ * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (device) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index */ -auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, +auto build(raft::resources const& res, + const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; -auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. 
+ * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (host) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ +auto build(raft::resources const& res, + const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; -auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. 
+ * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (device) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ +auto build(raft::resources const& res, + const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; -auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. 
+ * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (host) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ +auto build(raft::resources const& res, + const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; -auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. 
+ * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (device) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ +auto build(raft::resources const& res, + const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; -auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. 
+ * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (host) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ +auto build(raft::resources const& res, + const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; /** * @defgroup vamana_cpp_serialize Vamana serialize functions * @{ */ + /** * Save the index to file. + * + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] file_prefix prefix of path and name of index files + * @param[in] index Vamana index + * */ void serialize(raft::resources const& handle, const std::string& file_prefix, - const cuvs::neighbors::experimental::vamana::index& index); + const cuvs::neighbors::vamana::index& index); +/** + * Save the index to file. + * + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] file_prefix prefix of path and name of index files + * @param[in] index Vamana index + * + */ void serialize(raft::resources const& handle, const std::string& file_prefix, - const cuvs::neighbors::experimental::vamana::index& index); + const cuvs::neighbors::vamana::index& index); +/** + * Save the index to file. + * + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] file_prefix prefix of path and name of index files + * @param[in] index Vamana index + * + */ void serialize(raft::resources const& handle, const std::string& file_prefix, - const cuvs::neighbors::experimental::vamana::index& index); + const cuvs::neighbors::vamana::index& index); /** * @} */ -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/detail/vamana/greedy_search.cuh b/cpp/src/neighbors/detail/vamana/greedy_search.cuh index f51c6c91b..4d94bbaa7 100644 --- a/cpp/src/neighbors/detail/vamana/greedy_search.cuh +++ b/cpp/src/neighbors/detail/vamana/greedy_search.cuh @@ -30,7 +30,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* 
@defgroup greedy_search_detail greedy search * @{ @@ -112,13 +112,15 @@ __global__ void GreedySearchKernel( DistPair candidate_queue; }; + int align_padding = (((dim - 1) / alignof(ShmemLayout)) + 1) * alignof(ShmemLayout) - dim; + // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; size_t smem_offset = sort_smem_size; // temp sorting memory takes first chunk T* s_coords = reinterpret_cast(&smem[smem_offset]); - smem_offset += dim * sizeof(T); + smem_offset += (dim + align_padding) * sizeof(T); Node* topk_pq = reinterpret_cast*>(&smem[smem_offset]); smem_offset += topk * sizeof(Node); @@ -283,4 +285,4 @@ __global__ void GreedySearchKernel( * @} */ -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/macros.cuh b/cpp/src/neighbors/detail/vamana/macros.cuh index 5692650a0..c290413a2 100644 --- a/cpp/src/neighbors/detail/vamana/macros.cuh +++ b/cpp/src/neighbors/detail/vamana/macros.cuh @@ -16,7 +16,7 @@ #pragma once -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* Macros to compute the shared memory requirements for CUB primitives used by search and prune */ #define COMPUTE_SMEM_SIZES(degree, visited_size, DEG, CANDS) \ @@ -79,4 +79,4 @@ namespace cuvs::neighbors::experimental::vamana::detail { SEARCH_CALL_SORT(topk, 512); \ SEARCH_CALL_SORT(topk, 1024); -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/priority_queue.cuh b/cpp/src/neighbors/detail/vamana/priority_queue.cuh index 4b3bd8466..6dc1dc94a 100644 --- a/cpp/src/neighbors/detail/vamana/priority_queue.cuh +++ b/cpp/src/neighbors/detail/vamana/priority_queue.cuh @@ -20,7 +20,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail 
{ +namespace cuvs::neighbors::vamana::detail { /*************************************************************************************** ***************************************************************************************/ @@ -326,4 +326,4 @@ __forceinline__ __device__ void enqueue_all_neighbors(int num_neighbors, } } -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/robust_prune.cuh b/cpp/src/neighbors/detail/vamana/robust_prune.cuh index 8446ac136..182d20c88 100644 --- a/cpp/src/neighbors/detail/vamana/robust_prune.cuh +++ b/cpp/src/neighbors/detail/vamana/robust_prune.cuh @@ -19,10 +19,12 @@ #include #include +#include + #include "macros.cuh" #include "vamana_structs.cuh" -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { // Load candidates (from query) and previous edges (from nbh_list) into registers (tmp) spanning // warp @@ -145,9 +147,11 @@ __global__ void RobustPruneKernel( // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; - T* s_coords = reinterpret_cast(&smem[sort_smem_size]); - DistPair* new_nbh_list = - reinterpret_cast*>(&smem[dim * sizeof(T) + sort_smem_size]); + int align_padding = raft::alignTo(dim, alignof(ShmemLayout)) - dim; + + T* s_coords = reinterpret_cast(&smem[sort_smem_size]); + DistPair* new_nbh_list = reinterpret_cast*>( + &smem[(dim + align_padding) * sizeof(T) + sort_smem_size]); static __shared__ Point s_query; s_query.coords = s_coords; @@ -245,4 +249,4 @@ __global__ void RobustPruneKernel( } // namespace -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index ec75c99c1..184b024f8 100644 --- 
a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -46,7 +46,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* @defgroup vamana_build_detail vamana build * @{ @@ -104,11 +104,12 @@ void batched_insert_vamana( "to 1.0"); max_batchsize = (int)dataset.extent(0); } - int insert_iters = (int)(params.vamana_iters); - double base = (double)(params.batch_base); - float alpha = (float)(params.alpha); - int visited_size = params.visited_size; - int queue_size = params.queue_size; + int insert_iters = (int)(params.vamana_iters); + double base = (double)(params.batch_base); + float alpha = (float)(params.alpha); + int visited_size = params.visited_size; + int queue_size = params.queue_size; + int reverse_batch = params.reverse_batchsize; if ((visited_size & (visited_size - 1)) != 0) { RAFT_LOG_WARN("visited_size must be a power of 2, rounding up."); @@ -152,36 +153,20 @@ void batched_insert_vamana( std::vector insert_order; create_insert_permutation(insert_order, (uint32_t)N); - // Memory needed to sort reverse edges - potentially large memory footprint - auto edge_dest = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(max_batchsize, degree)); - auto edge_src = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(max_batchsize, degree)); - - size_t temp_storage_bytes = max_batchsize * degree * (2 * sizeof(IdxT)); - RAFT_LOG_DEBUG("Temp storage needed for sorting (bytes): %lu", temp_storage_bytes); - auto temp_sort_storage = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(2 * max_batchsize, degree)); - // Calculate the shared memory sizes of each kernel int search_smem_sort_size = 0; int prune_smem_sort_size = 0; SELECT_SMEM_SIZES(degree, visited_size); // Sets above 2 
variables to appropriate sizes // Total dynamic shared memory used by GreedySearch - int search_smem_total_size = - static_cast(search_smem_sort_size + dim * sizeof(T) + visited_size * sizeof(Node) + - degree * sizeof(int) + queue_size * sizeof(DistPair)); + int align_padding = raft::alignTo(dim, 16) - dim; + int search_smem_total_size = static_cast( + search_smem_sort_size + (dim + align_padding) * sizeof(T) + visited_size * sizeof(Node) + + degree * sizeof(int) + queue_size * sizeof(DistPair)); // Total dynamic shared memory size needed by both RobustPrune calls - int prune_smem_total_size = - prune_smem_sort_size + dim * sizeof(T) + (degree + visited_size) * sizeof(DistPair); + int prune_smem_total_size = prune_smem_sort_size + (dim + align_padding) * sizeof(T) + + (degree + visited_size) * sizeof(DistPair); RAFT_LOG_DEBUG("Dynamic shared memory usage (bytes): GreedySearch: %d, RobustPrune: %d", search_smem_total_size, @@ -228,7 +213,6 @@ void batched_insert_vamana( metric, queue_size, search_smem_sort_size); - // Run on candidates of vectors being inserted RobustPruneKernel <<>>(d_graph.view(), @@ -252,6 +236,16 @@ void batched_insert_vamana( int total_edges; raft::copy(&total_edges, d_total_edges.data_handle(), 1, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + auto edge_dest = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(total_edges)); + auto edge_src = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(total_edges)); // Create reverse edge list create_reverse_edge_list @@ -261,6 +255,24 @@ void batched_insert_vamana( edge_src.data_handle(), edge_dest.data_handle()); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + + cub::DeviceMergeSort::SortPairs(d_temp_storage, + temp_storage_bytes, + edge_dest.data_handle(), + edge_src.data_handle(), + total_edges, + CmpEdge(), + stream); + + RAFT_LOG_DEBUG("Temp storage 
needed for sorting (bytes): %lu", temp_storage_bytes); + + auto temp_sort_storage = raft::make_device_mdarray( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(temp_storage_bytes / sizeof(IdxT))); + // Sort to group reverse edges by destination cub::DeviceMergeSort::SortPairs(temp_sort_storage.data_handle(), temp_storage_bytes, @@ -279,64 +291,77 @@ void batched_insert_vamana( edge_dest.data_handle() + total_edges); auto unique_indices = raft::make_device_vector(res, total_edges); raft::linalg::map_offset(res, unique_indices.view(), raft::identity_op{}); + thrust::unique_by_key( edge_dest_vec.begin(), edge_dest_vec.end(), unique_indices.data_handle()); - // Allocate reverse QueryCandidate list based on number of unique destinations - // TODO - Do this in batches to reduce memory footprint / support larger datasets - auto reverse_list_ptr = raft::make_device_mdarray>( - res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(unique_dests)); - auto rev_ids = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(unique_dests, visited_size)); - auto rev_dists = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(unique_dests, visited_size)); - - QueryCandidates* reverse_list = - static_cast*>(reverse_list_ptr.data_handle()); - - init_query_candidate_list<<<256, blockD, 0, stream>>>(reverse_list, - rev_ids.data_handle(), - rev_dists.data_handle(), - (int)unique_dests, - visited_size); - - // May need more blocks for reverse list - num_blocks = min(maxBlocks, unique_dests); - - // Populate reverse list ids and candidate lists from edge_src and edge_dest - populate_reverse_list_struct - <<>>(reverse_list, - edge_src.data_handle(), - edge_dest.data_handle(), - unique_indices.data_handle(), - unique_dests, - total_edges, - dataset.extent(0)); - - // Recompute distances (avoided keeping it during sorting) - 
recompute_reverse_dists - <<>>(reverse_list, dataset, unique_dests, metric); - - // Call 2nd RobustPrune on reverse query_list - RobustPruneKernel - <<>>(d_graph.view(), - raft::make_const_mdspan(dataset), - reverse_list_ptr.data_handle(), - unique_dests, - visited_size, - metric, - alpha, - prune_smem_sort_size); - - // Write new edge lists to graph - write_graph_edges_kernel<<>>( - d_graph.view(), reverse_list_ptr.data_handle(), degree, unique_dests); + edge_dest_vec.clear(); + edge_dest_vec.shrink_to_fit(); + + // Batch execution of reverse edge creation/application + reverse_batch = params.reverse_batchsize; + for (int rev_start = 0; rev_start < (int)unique_dests; rev_start += reverse_batch) { + if (rev_start + reverse_batch > (int)unique_dests) { + reverse_batch = (int)unique_dests - rev_start; + } + + // Allocate reverse QueryCandidate list based on number of unique destinations + auto reverse_list_ptr = raft::make_device_mdarray>( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(reverse_batch)); + auto rev_ids = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(reverse_batch, visited_size)); + auto rev_dists = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(reverse_batch, visited_size)); + + QueryCandidates* reverse_list = + static_cast*>(reverse_list_ptr.data_handle()); + + init_query_candidate_list<<<256, blockD, 0, stream>>>(reverse_list, + rev_ids.data_handle(), + rev_dists.data_handle(), + (int)reverse_batch, + visited_size); + + // May need more blocks for reverse list + num_blocks = min(maxBlocks, reverse_batch); + + // Populate reverse list ids and candidate lists from edge_src and edge_dest + populate_reverse_list_struct + <<>>(reverse_list, + edge_src.data_handle(), + edge_dest.data_handle(), + unique_indices.data_handle(), + unique_dests, + total_edges, + dataset.extent(0), + rev_start, + 
reverse_batch); + + // Recompute distances (avoided keeping it during sorting) + recompute_reverse_dists + <<>>(reverse_list, dataset, reverse_batch, metric); + + // Call 2nd RobustPrune on reverse query_list + RobustPruneKernel + <<>>(d_graph.view(), + raft::make_const_mdspan(dataset), + reverse_list_ptr.data_handle(), + reverse_batch, + visited_size, + metric, + alpha, + prune_smem_sort_size); + + // Write new edge lists to graph + write_graph_edges_kernel<<>>( + d_graph.view(), reverse_list_ptr.data_handle(), degree, reverse_batch); + } start += step_size; step_size *= base; @@ -371,8 +396,6 @@ index build( RAFT_EXPECTS(params.visited_size > graph_degree, "visited_size must be > graph_degree"); int dim = dataset.extent(1); - // TODO - Fix issue with alignment when dataset dimension is odd - RAFT_EXPECTS(dim % 2 == 0, "Datasets with an odd number of dimensions not currently supported"); RAFT_LOG_DEBUG("Creating empty graph structure"); auto vamana_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); @@ -405,4 +428,4 @@ index build( * @} */ -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh index c360ae19a..27a17205e 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh @@ -34,7 +34,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /** * Save the index to file. 
@@ -117,4 +117,4 @@ void serialize(raft::resources const& res, if (!index_of) { RAFT_FAIL("Error writing output %s", file_name.c_str()); } } -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh index f6f0279f7..22678c196 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh @@ -34,7 +34,7 @@ #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* @defgroup vamana_structures vamana structures * @{ @@ -170,7 +170,7 @@ __device__ SUMTYPE l2_ILP4(Point* src_vec, Point* dst_ve temp_dst[0] = dst_vec->coords[i]; if (i + 32 < src_vec->Dim) temp_dst[1] = dst_vec->coords[i + 32]; if (i + 64 < src_vec->Dim) temp_dst[2] = dst_vec->coords[i + 64]; - if (i + 92 < src_vec->Dim) temp_dst[3] = dst_vec->coords[i + 96]; + if (i + 96 < src_vec->Dim) temp_dst[3] = dst_vec->coords[i + 96]; partial_sum[0] = fmaf( (src_vec[0].coords[i] - temp_dst[0]), (src_vec[0].coords[i] - temp_dst[0]), partial_sum[0]); @@ -182,7 +182,7 @@ __device__ SUMTYPE l2_ILP4(Point* src_vec, Point* dst_ve partial_sum[2] = fmaf((src_vec[0].coords[i + 64] - temp_dst[2]), (src_vec[0].coords[i + 64] - temp_dst[2]), partial_sum[2]); - if (i + 92 < src_vec->Dim) + if (i + 96 < src_vec->Dim) partial_sum[3] = fmaf((src_vec[0].coords[i + 96] - temp_dst[3]), (src_vec[0].coords[i + 96] - temp_dst[3]), partial_sum[3]); @@ -192,6 +192,7 @@ __device__ SUMTYPE l2_ILP4(Point* src_vec, Point* dst_ve for (int offset = 16; offset > 0; offset /= 2) { partial_sum[0] += __shfl_down_sync(FULL_BITMASK, partial_sum[0], offset); } + return partial_sum[0]; } @@ -419,22 +420,24 @@ __global__ void populate_reverse_list_struct(QueryCandidates* revers int* unique_indices, int unique_dests, int total_edges, - int N) + int N, + int rev_start, + int 
reverse_batch) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < unique_dests; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < reverse_batch; i += blockDim.x * gridDim.x) { - reverse_list[i].queryId = edge_dest[unique_indices[i]]; - if (i == unique_dests - 1) { - reverse_list[i].size = total_edges - unique_indices[i]; + reverse_list[i].queryId = edge_dest[unique_indices[i + rev_start]]; + if (rev_start + i == unique_dests - 1) { + reverse_list[i].size = total_edges - unique_indices[i + rev_start]; } else { - reverse_list[i].size = unique_indices[i + 1] - unique_indices[i]; + reverse_list[i].size = unique_indices[i + rev_start + 1] - unique_indices[i + rev_start]; } if (reverse_list[i].size > reverse_list[i].maxSize) { reverse_list[i].size = reverse_list[i].maxSize; } for (int j = 0; j < reverse_list[i].size; j++) { - reverse_list[i].ids[j] = edge_src[unique_indices[i] + j]; + reverse_list[i].ids[j] = edge_src[unique_indices[i + rev_start] + j]; } for (int j = reverse_list[i].size; j < reverse_list[i].maxSize; j++) { reverse_list[i].ids[j] = raft::upper_bound(); @@ -475,4 +478,4 @@ __global__ void recompute_reverse_dists( * @} */ -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/vamana.cuh b/cpp/src/neighbors/vamana.cuh index 9b9e8d271..964d7a9a0 100644 --- a/cpp/src/neighbors/vamana.cuh +++ b/cpp/src/neighbors/vamana.cuh @@ -31,7 +31,7 @@ #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { /** * @defgroup VAMANA ANN Graph-based nearest neighbor search @@ -85,8 +85,7 @@ index build( const index_params& params, raft::mdspan, raft::row_major, Accessor> dataset) { - return cuvs::neighbors::experimental::vamana::detail::build( - res, params, dataset); + return cuvs::neighbors::vamana::detail::build(res, params, dataset); } template @@ -94,9 +93,9 @@ void serialize(raft::resources const& res, const std::string& 
file_prefix, const index& index_) { - cuvs::neighbors::experimental::vamana::detail::build(res, file_prefix, index_); + cuvs::neighbors::vamana::detail::build(res, file_prefix, index_); } /** @} */ // end group vamana -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_build_float.cu b/cpp/src/neighbors/vamana_build_float.cu index b83af6122..0e09d6399 100644 --- a/cpp/src/neighbors/vamana_build_float.cu +++ b/cpp/src/neighbors/vamana_build_float.cu @@ -17,27 +17,27 @@ #include "vamana.cuh" #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { -#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ - { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ - { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ +#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(float, uint32_t); #undef RAFT_INST_VAMANA_BUILD -} // namespace cuvs::neighbors::experimental::vamana +} // 
namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_build_int8.cu b/cpp/src/neighbors/vamana_build_int8.cu index 91d2cf028..f70b9ea27 100644 --- a/cpp/src/neighbors/vamana_build_int8.cu +++ b/cpp/src/neighbors/vamana_build_int8.cu @@ -17,27 +17,27 @@ #include "vamana.cuh" #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { -#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ - { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ - { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ +#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(int8_t, uint32_t); #undef RAFT_INST_VAMANA_BUILD -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_build_uint8.cu b/cpp/src/neighbors/vamana_build_uint8.cu index bba93e7f4..8daf0c065 100644 --- a/cpp/src/neighbors/vamana_build_uint8.cu +++ b/cpp/src/neighbors/vamana_build_uint8.cu @@ -17,27 +17,27 @@ 
#include "vamana.cuh" #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { -#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ - { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ - { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ +#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(uint8_t, uint32_t); #undef RAFT_INST_VAMANA_BUILD -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize.cuh b/cpp/src/neighbors/vamana_serialize.cuh index a49d267b3..b8cb580a8 100644 --- a/cpp/src/neighbors/vamana_serialize.cuh +++ b/cpp/src/neighbors/vamana_serialize.cuh @@ -18,7 +18,7 @@ #include "detail/vamana/vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { /** * @defgroup VAMANA graph serialize/derserialize @@ -28,12 +28,11 @@ namespace cuvs::neighbors::experimental::vamana { #define 
CUVS_INST_VAMANA_SERIALIZE(DTYPE) \ void serialize(raft::resources const& handle, \ const std::string& file_prefix, \ - const cuvs::neighbors::experimental::vamana::index& index_) \ + const cuvs::neighbors::vamana::index& index_) \ { \ - cuvs::neighbors::experimental::vamana::detail::serialize( \ - handle, file_prefix, index_); \ + cuvs::neighbors::vamana::detail::serialize(handle, file_prefix, index_); \ }; /** @} */ // end group vamana -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize_float.cu b/cpp/src/neighbors/vamana_serialize_float.cu index f25369368..8bf7ceb1e 100644 --- a/cpp/src/neighbors/vamana_serialize_float.cu +++ b/cpp/src/neighbors/vamana_serialize_float.cu @@ -16,8 +16,8 @@ #include "vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { CUVS_INST_VAMANA_SERIALIZE(float); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize_int8.cu b/cpp/src/neighbors/vamana_serialize_int8.cu index 1cd54b198..0f87f67ce 100644 --- a/cpp/src/neighbors/vamana_serialize_int8.cu +++ b/cpp/src/neighbors/vamana_serialize_int8.cu @@ -16,8 +16,8 @@ #include "vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { CUVS_INST_VAMANA_SERIALIZE(int8_t); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize_uint8.cu b/cpp/src/neighbors/vamana_serialize_uint8.cu index 3e6d945b8..871c30506 100644 --- a/cpp/src/neighbors/vamana_serialize_uint8.cu +++ b/cpp/src/neighbors/vamana_serialize_uint8.cu @@ -16,8 +16,8 @@ #include "vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { CUVS_INST_VAMANA_SERIALIZE(uint8_t); -} // namespace 
cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/tests/neighbors/ann_vamana.cuh b/cpp/tests/neighbors/ann_vamana.cuh index 9d9df4470..9fe0324d7 100644 --- a/cpp/tests/neighbors/ann_vamana.cuh +++ b/cpp/tests/neighbors/ann_vamana.cuh @@ -46,7 +46,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { struct edge_op { template @@ -64,6 +64,7 @@ struct AnnVamanaInputs { double max_fraction; cuvs::distance::DistanceType metric; bool host_dataset; + int reverse_batchsize; // cagra search params int n_queries; @@ -131,10 +132,11 @@ class AnnVamanaTest : public ::testing::TestWithParam { void testVamana() { vamana::index_params index_params; - index_params.metric = ps.metric; - index_params.graph_degree = ps.graph_degree; - index_params.visited_size = ps.visited_size; - index_params.max_fraction = ps.max_fraction; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.visited_size = ps.visited_size; + index_params.max_fraction = ps.max_fraction; + index_params.reverse_batchsize = ps.reverse_batchsize; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -264,14 +266,13 @@ inline std::vector generate_inputs() { std::vector inputs = raft::util::itertools::product( {1000}, - // {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // TODO - fix alignment - // issue for odd dims - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim - {32}, // graph degree - {64, 128, 256}, // visited_size + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {32}, // graph degree + {64, 256}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, {false}, + {100, 1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -280,55 +281,58 @@ inline std::vector generate_inputs() {1}, {0.2}); - std::vector inputs2 = - raft::util::itertools::product({1000}, - {16, 32, 64, 128, 
192, 256, 512, 1024}, // dim - {64}, // graph degree - {128, 256, 512}, // visited_size - {0.06, 0.1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {100}, - {10}, - {cagra::search_algo::AUTO}, - {10}, - {32}, - {1}, - {0.2}); + std::vector inputs2 = raft::util::itertools::product( + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {64}, // graph degree + {128, 512}, // visited_size + {0.06}, + {cuvs::distance::DistanceType::L2Expanded}, + {false}, + {1000000}, + {100}, + {10}, + {cagra::search_algo::AUTO}, + {10}, + {32}, + {1}, + {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - inputs2 = - raft::util::itertools::product({1000}, - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim - {128}, // graph degree - {256, 512}, // visited_size - {0.06, 0.1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {100}, - {10}, - {cagra::search_algo::AUTO}, - {10}, - {64}, - {1}, - {0.2}); + inputs2 = raft::util::itertools::product( + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {128}, // graph degree + {256}, // visited_size + {0.06}, + {cuvs::distance::DistanceType::L2Expanded}, + {false}, + {1000000}, + {100}, + {10}, + {cagra::search_algo::AUTO}, + {10}, + {64}, + {1}, + {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - inputs2 = - raft::util::itertools::product({1000}, - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim - {256}, // graph degree - {512, 1024}, // visited_size - {0.06, 0.1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {100}, - {10}, - {cagra::search_algo::AUTO}, - {10}, - {64}, - {1}, - {0.2}); + inputs2 = raft::util::itertools::product( + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {256}, // graph degree + {512, 1024}, // visited_size + {0.06}, + {cuvs::distance::DistanceType::L2Expanded}, + {false}, + {1000000}, + {100}, + {10}, + {cagra::search_algo::AUTO}, + {10}, + {64}, + {1}, + {0.2}); 
inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); return inputs; @@ -336,4 +340,4 @@ inline std::vector generate_inputs() const std::vector inputs = generate_inputs(); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/tests/neighbors/ann_vamana/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_vamana/test_float_uint32_t.cu index 9aa9da1b8..7b89b6544 100644 --- a/cpp/tests/neighbors/ann_vamana/test_float_uint32_t.cu +++ b/cpp/tests/neighbors/ann_vamana/test_float_uint32_t.cu @@ -18,11 +18,11 @@ #include "../ann_vamana.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { typedef AnnVamanaTest AnnVamanaTestF_U32; TEST_P(AnnVamanaTestF_U32, AnnVamana) { this->testVamana(); } INSTANTIATE_TEST_CASE_P(AnnVamanaTest, AnnVamanaTestF_U32, ::testing::ValuesIn(inputs)); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/tests/neighbors/ann_vamana/test_int8_t_uint32_t.cu b/cpp/tests/neighbors/ann_vamana/test_int8_t_uint32_t.cu index 0a6b563b2..843d2274a 100644 --- a/cpp/tests/neighbors/ann_vamana/test_int8_t_uint32_t.cu +++ b/cpp/tests/neighbors/ann_vamana/test_int8_t_uint32_t.cu @@ -18,11 +18,11 @@ #include "../ann_vamana.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { typedef AnnVamanaTest AnnVamanaTestI8_U32; TEST_P(AnnVamanaTestI8_U32, AnnVamana) { this->testVamana(); } INSTANTIATE_TEST_CASE_P(AnnVamanaTest, AnnVamanaTestI8_U32, ::testing::ValuesIn(inputs)); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/tests/neighbors/ann_vamana/test_uint8_t_uint32_t.cu b/cpp/tests/neighbors/ann_vamana/test_uint8_t_uint32_t.cu index c0680dc18..f08db0c49 100644 --- a/cpp/tests/neighbors/ann_vamana/test_uint8_t_uint32_t.cu +++ b/cpp/tests/neighbors/ann_vamana/test_uint8_t_uint32_t.cu @@ -18,11 +18,11 @@ #include 
"../ann_vamana.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { typedef AnnVamanaTest AnnVamanaTestU8_U32; TEST_P(AnnVamanaTestU8_U32, AnnVamana) { this->testVamana(); } INSTANTIATE_TEST_CASE_P(AnnVamanaTest, AnnVamanaTestU8_U32, ::testing::ValuesIn(inputs)); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index ab810ab53..95359558f 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -18,3 +18,4 @@ Nearest Neighbors neighbors_nn_descent.rst neighbors_refine.rst neighbors_mg.rst + neighbors_vamana.rst diff --git a/docs/source/cpp_api/neighbors_vamana.rst b/docs/source/cpp_api/neighbors_vamana.rst new file mode 100644 index 000000000..25447efce --- /dev/null +++ b/docs/source/cpp_api/neighbors_vamana.rst @@ -0,0 +1,44 @@ +Vamana +====== + +Vamana is the graph construction algorithm behind the well-known DiskANN vector search solution. The cuVS implementation of Vamana/DiskANN is a custom GPU-acceleration version of the algorithm that aims to reduce index construction time using NVIDIA GPUs. + +.. role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors::vamana* + +Index build parameters +---------------------- + +.. doxygengroup:: vamana_cpp_index_params + :project: cuvs + :members: + :content-only: + +Index +----- + +.. doxygengroup:: vamana_cpp_index + :project: cuvs + :members: + :content-only: + +Index build +----------- + +.. doxygengroup:: vamana_cpp_index_build + :project: cuvs + :members: + :content-only: + +Index serialize +--------------- + +.. 
doxygengroup:: vamana_cpp_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/indexes/indexes.rst b/docs/source/indexes/indexes.rst index 4a18f793a..8746b84ac 100644 --- a/docs/source/indexes/indexes.rst +++ b/docs/source/indexes/indexes.rst @@ -9,6 +9,7 @@ Nearest Neighbor Indexes cagra.rst ivfflat.rst ivfpq.rst + vamana.rst Indices and tables diff --git a/docs/source/indexes/vamana.rst b/docs/source/indexes/vamana.rst new file mode 100644 index 000000000..7e0a79fb8 --- /dev/null +++ b/docs/source/indexes/vamana.rst @@ -0,0 +1,75 @@ +Vamana +====== + +VAMANA is the underlying graph construction algorithm used to construct indexes for the DiskANN vector search solution. DiskANN and the Vamana algorithm are described in detail in the `published paper `, and a highly optimized `open-source repository ` includes many features for index construction and search. In cuVS, we provide a version of the Vamana algorithm optimized for GPU architectures to accelerate graph construction to build DiskANN indexes. At a high level, the Vamana algorithm operates as follows: + +* 1. Starting with an empty graph, select a medoid vector from the D-dimension vector dataset and insert it into the graph. +* 2. Iteratively insert batches of dataset vectors into the graph, connecting each inserted vector to neighbors based on a graph traversal. +* 3. For each batch, create reverse edges and prune unnecessary edges. + +There are many algorithmic details that are outlined in the `paper `, and many GPU-specific optimizations are included in this implementation. + +The current implementation of DiskANN in cuVS only includes the 'in-memory' graph construction and a serialization step that writes the index to a file. This index file can then be used by the `open-source DiskANN ` library to perform efficient search. Additional DiskANN functionality, including GPU-accelerated search and 'ssd' index build are planned for future cuVS releases. 
+ +[ :doc:`C++ API <../cpp_api/neighbors_vamana>` | :doc:`Python API <../python_api/neighbors_vamana>` ] + +Interoperability with CPU DiskANN +--------------------------------- + +The 'vamana::serialize' API call writes the index to a file with a format that is compatible with the `open-source DiskANN repository `. This allows cuVS to be used to accelerate index construction while leveraging the efficient CPU-based search currently available. + +Configuration parameters +------------------------ + +Build parameters +~~~~~~~~~~~~~~~~ + +.. list-table:: + :widths: 25 25 50 + :header-rows: 1 + + * - Name + - Default + - Description + * - graph_degree + - 32 + - The maximum degree of the final Vamana graph. The internal representation of the graph includes this many edges for every node, but serialize will compress the graph into a 'CSR' format with, potentially, fewer edges. + * - visited_size + - 64 + - Maximum number of visited nodes saved during each traversal to insert a new node. This corresponds to the 'L' parameter in the paper. + * - vamana_iters + - 1 + - Number of iterations run to improve the graph. Each iteration involves inserting every vector in the dataset. + * - alpha + - 1.2 + - Alpha parameter that defines how aggressively to prune edges. + * - max_fraction + - 0.06 + - Maximum fraction of the dataset that will be inserted as a single batch. Larger max batch size decreases graph quality but improves speed. + * - batch_base + - 2 + - Base of growth rate of batch sizes. Insertion batch sizes increase exponentially based on this parameter until max_fraction is reached. + * - queue_size + - 127 + - Size of the candidate queue structure used during graph traversal. Must be (2^x)-1 for some x, and must be > visited_size. + +Tuning Considerations +--------------------- + +The 2 hyper-parameters that are most often tuned are `graph_degree` and `visited_size`. 
The time needed to create a graph increases dramatically when increasing `graph_degree`, in particular. However, larger graphs may be needed to achieve very high recall search, especially for large datasets. + +Memory footprint +---------------- + +Vamana builds a graph that is stored in device memory. However, in order to serialize the index and write it to a file for later use, it must be moved into host memory. If the `include_dataset` parameter is also set, then the dataset must be resident in host memory when calling serialize as well. + +Device memory usage +~~~~~~~~~~~~~~~~~~~ + +The built index represents the graph as fixed degree, storing a total of :math:`graph\_degree * n\_index\_vectors` edges. Graph construction also requires the dataset be in device memory (or it copies it to device during build). In addition, device memory is used during construction to sort and create the reverse edges. Thus, the amount of device memory needed depends on the dataset itself, but it is bounded by a maximum sum of: + +- vector dataset: :math:`n\_index\_vectors * n\_dims * sizeof(T)` +- output graph: :math:`graph\_degree * n\_index\_vectors * sizeof(IdxT)` +- scratch memory: :math:`n\_index\_vectors * max\_fraction * (2 + graph\_degree) * sizeof(IdxT)` + +Reduction in scratch device memory requirements are planned for upcoming releases of cuVS. 
diff --git a/examples/cpp/src/vamana_example.cu b/examples/cpp/src/vamana_example.cu index 60bf14d56..9e5201d31 100644 --- a/examples/cpp/src/vamana_example.cu +++ b/examples/cpp/src/vamana_example.cu @@ -29,11 +29,15 @@ #include "common.cuh" template -void vamana_build_and_write(raft::device_resources const &dev_resources, +void vamana_build_and_write(raft::device_resources const& dev_resources, raft::device_matrix_view dataset, - std::string out_fname, int degree, int visited_size, - float max_fraction, int iters) { - using namespace cuvs::neighbors::experimental; + std::string out_fname, + int degree, + int visited_size, + float max_fraction, + int iters) +{ + using namespace cuvs::neighbors; // use default index parameters vamana::index_params index_params; @@ -46,13 +50,12 @@ void vamana_build_and_write(raft::device_resources const &dev_resources, auto start = std::chrono::system_clock::now(); auto index = vamana::build(dev_resources, index_params, dataset); - auto end = std::chrono::system_clock::now(); + auto end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end - start; std::cout << "Vamana index has " << index.size() << " vectors" << std::endl; - std::cout << "Vamana graph has degree " << index.graph_degree() - << ", graph size [" << index.graph().extent(0) << ", " - << index.graph().extent(1) << "]" << std::endl; + std::cout << "Vamana graph has degree " << index.graph_degree() << ", graph size [" + << index.graph().extent(0) << ", " << index.graph().extent(1) << "]" << std::endl; std::cout << "Time to build index: " << elapsed_seconds.count() << "s\n"; @@ -60,9 +63,11 @@ void vamana_build_and_write(raft::device_resources const &dev_resources, serialize(dev_resources, out_fname, index); } -void usage() { - printf("Usage: ./vamana_example \n"); +void usage() +{ + printf( + "Usage: ./vamana_example \n"); printf("Input file expected to be binary file of fp32 vectors.\n"); printf("Graph degree sizes supported: 32, 64, 128, 
256\n"); printf("Visited_size must be > degree and a power of 2.\n"); @@ -71,13 +76,14 @@ void usage() { exit(1); } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) +{ raft::device_resources dev_resources; // Set pool memory resource with 1 GiB initial pool size. All allocations use // the same pool. rmm::mr::pool_memory_resource pool_mr( - rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull); + rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull); rmm::mr::set_current_device_resource(&pool_mr); // Alternatively, one could define a pool allocator for temporary arrays (used @@ -87,22 +93,24 @@ int main(int argc, char *argv[]) { // limit. raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * // 1024 * 1024 * 1024ull); - if (argc != 7) - usage(); + if (argc != 7) usage(); - std::string data_fname = (std::string)(argv[1]); // Input filename - std::string out_fname = (std::string)argv[2]; // Output index filename - int degree = atoi(argv[3]); - int max_visited = atoi(argv[4]); - float max_fraction = atof(argv[5]); - int iters = atoi(argv[6]); + std::string data_fname = (std::string)(argv[1]); // Input filename + std::string out_fname = (std::string)argv[2]; // Output index filename + int degree = atoi(argv[3]); + int max_visited = atoi(argv[4]); + float max_fraction = atof(argv[5]); + int iters = atoi(argv[6]); // Read in binary dataset file - auto dataset = - read_bin_dataset(dev_resources, data_fname, INT_MAX); + auto dataset = read_bin_dataset(dev_resources, data_fname, INT_MAX); // Simple build example to create graph and write to a file - vamana_build_and_write( - dev_resources, raft::make_const_mdspan(dataset.view()), out_fname, degree, - max_visited, max_fraction, iters); + vamana_build_and_write(dev_resources, + raft::make_const_mdspan(dataset.view()), + out_fname, + degree, + max_visited, + max_fraction, + iters); } From c778c88118d829404630bbe71d139d688a591bcb Mon Sep 17 00:00:00 2001 From: rhdong Date: Thu, 
30 Jan 2025 16:29:15 -0800 Subject: [PATCH 3/4] [Feat] Support `bitset` filter for Brute Force (#560) Authors: - rhdong (https://github.com/rhdong) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/560 --- cpp/include/cuvs/neighbors/brute_force.hpp | 165 +++++- cpp/include/cuvs/neighbors/common.hpp | 34 +- cpp/src/neighbors/brute_force_c.cpp | 14 +- cpp/src/neighbors/detail/knn_brute_force.cuh | 122 +++-- cpp/src/neighbors/sample_filter.cuh | 16 + .../neighbors/brute_force_prefiltered.cu | 496 +++++++++++++++++- python/cuvs/cuvs/tests/test_brute_force.py | 2 +- 7 files changed, 750 insertions(+), 99 deletions(-) diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index 99581469f..72a5cac12 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -332,15 +332,28 @@ auto build(raft::resources const& handle, * Note, this function requires a temporary buffer to store intermediate results between cuda kernel * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or - * eliminate entirely allocations happening within `search`: + * eliminate entirely allocations happening within `search`. + * + * Usage example: * @code{.cpp} - * ... - * // Use the same allocator across multiple searches to reduce the number of - * // cuda memory allocations - * brute_force::search(handle, index, queries1, out_inds1, out_dists1); - * brute_force::search(handle, index, queries2, out_inds2, out_dists2); - * brute_force::search(handle, index, queries3, out_inds3, out_dists3); - * ... 
+ * using namespace cuvs::neighbors; + * + * // use default index parameters + * brute_force::index_params index_params; + * // create and fill the index from a [N, D] dataset + * brute_force::index_params index_params; + * auto index = brute_force::build(handle, index_params, dataset); + * // use default search parameters + * brute_force::search_params search_params; + * // create a bitset to filter the search + * auto removed_indices = raft::make_device_vector(res, n_removed_indices); + * raft::core::bitset removed_indices_bitset( + * res, removed_indices.view(), dataset.extent(0)); + * // search K nearest neighbours according to a bitset + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * auto filter = filtering::bitset_filter(removed_indices_bitset.view()); + * brute_force::search(res, search_params, index, queries, neighbors, distances, filter); * @endcode * * @param[in] handle @@ -350,9 +363,17 @@ auto build(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] sample_filter An optional device bitmap filter function with a `row-major` layout and - * the shape of [n_queries, index->size()], which means the filter will use the first - * `index->size()` bits to indicate whether queries[0] should compute the distance with dataset. + * @param[in] sample_filter An optional device filter that restricts which dataset elements should + * be considered for each query. + * + * - Supports two types of filters: + * 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element. + * All queries share the same filter, with a logical shape of `[1, index->size()]`. + * 2. 
**Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`, + * where each bit indicates whether a specific dataset element should be considered for a + * particular query. (1 for inclusion, 0 for exclusion). + * + * - The default value is `none_sample_filter`, which applies no filtering. */ void search(raft::resources const& handle, const cuvs::neighbors::brute_force::search_params& params, @@ -379,15 +400,28 @@ void search(raft::resources const& handle, * Note, this function requires a temporary buffer to store intermediate results between cuda kernel * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or - * eliminate entirely allocations happening within `search`: + * eliminate entirely allocations happening within `search`. + * + * Usage example: * @code{.cpp} - * ... - * // Use the same allocator across multiple searches to reduce the number of - * // cuda memory allocations - * brute_force::search(handle, index, queries1, out_inds1, out_dists1); - * brute_force::search(handle, index, queries2, out_inds2, out_dists2); - * brute_force::search(handle, index, queries3, out_inds3, out_dists3); - * ... 
+ * using namespace cuvs::neighbors; + * + * // use default index parameters + * brute_force::index_params index_params; + * // create and fill the index from a [N, D] dataset + * brute_force::index_params index_params; + * auto index = brute_force::build(handle, index_params, dataset); + * // use default search parameters + * brute_force::search_params search_params; + * // create a bitset to filter the search + * auto removed_indices = raft::make_device_vector(res, n_removed_indices); + * raft::core::bitset removed_indices_bitset( + * res, removed_indices.view(), dataset.extent(0)); + * // search K nearest neighbours according to a bitset + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * auto filter = filtering::bitset_filter(removed_indices_bitset.view()); + * brute_force::search(res, search_params, index, queries, neighbors, distances, filter); * @endcode * * @param[in] handle @@ -397,8 +431,17 @@ void search(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] sample_filter a optional device bitmap filter function that greenlights samples for a - * given + * @param[in] sample_filter An optional device filter that restricts which dataset elements should + * be considered for each query. + * + * - Supports two types of filters: + * 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element. + * All queries share the same filter, with a logical shape of `[1, index->size()]`. + * 2. **Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`, + * where each bit indicates whether a specific dataset element should be considered for a + * particular query. (1 for inclusion, 0 for exclusion). 
+ * + * - The default value is `none_sample_filter`, which applies no filtering. */ void search(raft::resources const& handle, const cuvs::neighbors::brute_force::search_params& params, @@ -421,6 +464,33 @@ void search(raft::resources const& handle, * * See the [brute_force::build](#brute_force::build) documentation for a usage example. * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * + * // use default index parameters + * brute_force::index_params index_params; + * // create and fill the index from a [N, D] dataset + * brute_force::index_params index_params; + * auto index = brute_force::build(handle, index_params, dataset); + * // use default search parameters + * brute_force::search_params search_params; + * // create a bitset to filter the search + * auto removed_indices = raft::make_device_vector(res, n_removed_indices); + * raft::core::bitset removed_indices_bitset( + * res, removed_indices.view(), dataset.extent(0)); + * // search K nearest neighbours according to a bitset + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * auto filter = filtering::bitset_filter(removed_indices_bitset.view()); + * brute_force::search(res, search_params, index, queries, neighbors, distances, filter); + * @endcode + * * @param[in] handle * @param[in] params parameters configuring the search * @param[in] index bruteforce constructed index @@ -428,8 +498,17 @@ void search(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] 
* @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] sample_filter an optional device bitmap filter function that greenlights samples for a - * given query + * @param[in] sample_filter An optional device filter that restricts which dataset elements should + * be considered for each query. + * + * - Supports two types of filters: + * 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element. + * All queries share the same filter, with a logical shape of `[1, index->size()]`. + * 2. **Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`, + * where each bit indicates whether a specific dataset element should be considered for a + * particular query. (1 for inclusion, 0 for exclusion). + * + * - The default value is `none_sample_filter`, which applies no filtering. */ void search(raft::resources const& handle, const cuvs::neighbors::brute_force::search_params& params, @@ -452,6 +531,33 @@ void search(raft::resources const& handle, * * See the [brute_force::build](#brute_force::build) documentation for a usage example. * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * + * // use default index parameters + * brute_force::index_params index_params; + * // create and fill the index from a [N, D] dataset + * brute_force::index_params index_params; + * auto index = brute_force::build(handle, index_params, dataset); + * // use default search parameters + * brute_force::search_params search_params; + * // create a bitset to filter the search + * auto removed_indices = raft::make_device_vector(res, n_removed_indices); + * raft::core::bitset removed_indices_bitset( + * res, removed_indices.view(), dataset.extent(0)); + * // search K nearest neighbours according to a bitset + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * auto filter = filtering::bitset_filter(removed_indices_bitset.view()); + * brute_force::search(res, search_params, index, queries, neighbors, distances, filter); + * @endcode + * * @param[in] handle * @param[in] params parameters configuring the search * @param[in] index bruteforce constructed index @@ -459,8 +565,17 @@ void search(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] sample_filter an optional device bitmap filter function that greenlights samples for a - * given query + * @param[in] sample_filter An optional device filter that restricts which dataset elements should + * be considered for each query. + * + * - Supports two types of filters: + * 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element. + * All queries share the same filter, with a logical shape of `[1, index->size()]`. + * 2. 
**Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`, + * where each bit indicates whether a specific dataset element should be considered for a + * particular query. (1 for inclusion, 0 for exclusion). + * + * - The default value is `none_sample_filter`, which applies no filtering. */ void search(raft::resources const& handle, const cuvs::neighbors::brute_force::search_params& params, diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index bd9ea4834..5dc99a4e8 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -456,8 +457,11 @@ inline constexpr bool is_vpq_dataset_v = is_vpq_dataset::value; namespace filtering { +enum class FilterType { None, Bitmap, Bitset }; + struct base_filter { - virtual ~base_filter() = default; + virtual ~base_filter() = default; + virtual FilterType get_filter_type() const = 0; }; /* A filter that filters nothing. This is the default behavior. 
*/ @@ -475,6 +479,8 @@ struct none_sample_filter : public base_filter { const uint32_t query_ix, // the index of the current sample const uint32_t sample_ix) const; + + FilterType get_filter_type() const override { return FilterType::None; } }; /** @@ -513,15 +519,24 @@ struct ivf_to_sample_filter { */ template struct bitmap_filter : public base_filter { + using view_t = cuvs::core::bitmap_view; + // View of the bitset to use as a filter - const cuvs::core::bitmap_view bitmap_view_; + const view_t bitmap_view_; - bitmap_filter(const cuvs::core::bitmap_view bitmap_for_filtering); + bitmap_filter(const view_t bitmap_for_filtering); inline _RAFT_HOST_DEVICE bool operator()( // query index const uint32_t query_ix, // the index of the current sample const uint32_t sample_ix) const; + + FilterType get_filter_type() const override { return FilterType::Bitmap; } + + view_t view() const { return bitmap_view_; } + + template + void to_csr(raft::resources const& handle, csr_matrix_t& csr); }; /** @@ -532,15 +547,24 @@ struct bitmap_filter : public base_filter { */ template struct bitset_filter : public base_filter { + using view_t = cuvs::core::bitset_view; + // View of the bitset to use as a filter - const cuvs::core::bitset_view bitset_view_; + const view_t bitset_view_; - bitset_filter(const cuvs::core::bitset_view bitset_for_filtering); + bitset_filter(const view_t bitset_for_filtering); inline _RAFT_HOST_DEVICE bool operator()( // query index const uint32_t query_ix, // the index of the current sample const uint32_t sample_ix) const; + + FilterType get_filter_type() const override { return FilterType::Bitset; } + + view_t view() const { return bitset_view_; } + + template + void to_csr(raft::resources const& handle, csr_matrix_t& csr); }; /** diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp index 1693ac930..98c74e285 100644 --- a/cpp/src/neighbors/brute_force_c.cpp +++ b/cpp/src/neighbors/brute_force_c.cpp @@ -67,8 +67,8 @@ void 
_search(cuvsResources_t res, using queries_mdspan_type = raft::device_matrix_view; using neighbors_mdspan_type = raft::device_matrix_view; using distances_mdspan_type = raft::device_matrix_view; - using prefilter_mds_type = raft::device_vector_view; - using prefilter_bmp_type = cuvs::core::bitmap_view; + using prefilter_mds_type = raft::device_vector_view; + using prefilter_bmp_type = cuvs::core::bitmap_view; auto queries_mds = cuvs::core::from_dlpack(queries_tensor); auto neighbors_mds = cuvs::core::from_dlpack(neighbors_tensor); @@ -85,14 +85,14 @@ void _search(cuvsResources_t res, distances_mds, cuvs::neighbors::filtering::none_sample_filter{}); } else if (prefilter.type == BITMAP) { - auto prefilter_ptr = reinterpret_cast(prefilter.addr); - auto prefilter_mds = cuvs::core::from_dlpack(prefilter_ptr); - auto prefilter_view = cuvs::neighbors::filtering::bitmap_filter( - prefilter_bmp_type((const uint32_t*)prefilter_mds.data_handle(), + auto prefilter_ptr = reinterpret_cast(prefilter.addr); + auto prefilter_mds = cuvs::core::from_dlpack(prefilter_ptr); + const auto prefilter = cuvs::neighbors::filtering::bitmap_filter( + prefilter_bmp_type((uint32_t*)prefilter_mds.data_handle(), queries_mds.extent(0), index_ptr->dataset().extent(0))); cuvs::neighbors::brute_force::search( - *res_ptr, params, *index_ptr, queries_mds, neighbors_mds, distances_mds, prefilter_view); + *res_ptr, params, *index_ptr, queries_mds, neighbors_mds, distances_mds, prefilter); } else { RAFT_FAIL("Unsupported prefilter type: BITSET"); } diff --git a/cpp/src/neighbors/detail/knn_brute_force.cuh b/cpp/src/neighbors/detail/knn_brute_force.cuh index f1976e002..5caf84cc7 100644 --- a/cpp/src/neighbors/detail/knn_brute_force.cuh +++ b/cpp/src/neighbors/detail/knn_brute_force.cuh @@ -56,9 +56,12 @@ #include #include +#include #include +#include namespace cuvs::neighbors::detail { + /** * Calculates brute force knn, using a fixed memory budget * by tiling over both the rows and columns of 
pairwise_distances @@ -82,8 +85,10 @@ void tiled_brute_force_knn(const raft::resources& handle, size_t max_col_tile_size = 0, const DistanceT* precomputed_index_norms = nullptr, const DistanceT* precomputed_search_norms = nullptr, - const uint32_t* filter_bitmap = nullptr, - DistanceEpilogue distance_epilogue = raft::identity_op()) + const uint32_t* filter_bits = nullptr, + DistanceEpilogue distance_epilogue = raft::identity_op(), + cuvs::neighbors::filtering::FilterType filter_type = + cuvs::neighbors::filtering::FilterType::Bitmap) { // Figure out the number of rows/cols to tile for size_t tile_rows = 0; @@ -245,21 +250,23 @@ void tiled_brute_force_knn(const raft::resources& handle, } } - if (filter_bitmap != nullptr) { - auto distances_ptr = temp_distances.data(); - auto count = thrust::make_counting_iterator(0); - DistanceT masked_distance = select_min ? std::numeric_limits::infinity() - : std::numeric_limits::lowest(); + auto distances_ptr = temp_distances.data(); + auto count = thrust::make_counting_iterator(0); + DistanceT masked_distance = select_min ? std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + + if (filter_bits != nullptr) { + size_t n_cols = filter_type == cuvs::neighbors::filtering::FilterType::Bitmap ? n : 0; thrust::for_each(raft::resource::get_thrust_policy(handle), count, count + current_query_size * current_centroid_size, [=] __device__(IndexType idx) { IndexType row = i + (idx / current_centroid_size); IndexType col = j + (idx % current_centroid_size); - IndexType g_idx = row * n + col; + IndexType g_idx = row * n_cols + col; IndexType item_idx = (g_idx) >> 5; uint32_t bit_idx = (g_idx)&31; - uint32_t filter = filter_bitmap[item_idx]; + uint32_t filter = filter_bits[item_idx]; if ((filter & (uint32_t(1) << bit_idx)) == 0) { distances_ptr[idx] = masked_distance; } @@ -575,12 +582,12 @@ void brute_force_search( query_norms ? 
query_norms->data_handle() : nullptr); } -template +template void brute_force_search_filtered( raft::resources const& res, const cuvs::neighbors::brute_force::index& idx, raft::device_matrix_view queries, - cuvs::core::bitmap_view filter, + const cuvs::neighbors::filtering::base_filter* filter, raft::device_matrix_view neighbors, raft::device_matrix_view distances, std::optional> query_norms = std::nullopt) @@ -601,29 +608,42 @@ void brute_force_search_filtered( metric == cuvs::distance::DistanceType::CosineExpanded), "Index must has norms when using Euclidean, IP, and Cosine!"); - IdxT n_queries = queries.extent(0); - IdxT n_dataset = idx.dataset().extent(0); - IdxT dim = idx.dataset().extent(1); - IdxT k = neighbors.extent(1); + IdxT n_queries = queries.extent(0); + IdxT n_dataset = idx.dataset().extent(0); + IdxT dim = idx.dataset().extent(1); + IdxT k = neighbors.extent(1); + cuvs::neighbors::filtering::FilterType filter_type = filter->get_filter_type(); auto stream = raft::resource::get_cuda_stream(res); - // calc nnz - IdxT nnz_h = 0; - rmm::device_scalar nnz(0, stream); - auto nnz_view = raft::make_device_scalar_view(nnz.data()); - auto filter_view = - raft::make_device_vector_view(filter.data(), filter.n_elements()); - IdxT size_h = n_queries * n_dataset; - auto size_view = raft::make_host_scalar_view(&size_h); - - raft::popc(res, filter_view, size_view, nnz_view); - raft::copy(&nnz_h, nnz.data(), 1, stream); + std::optional, + const cuvs::core::bitset_view>> + filter_view; + + IdxT nnz_h = 0; + float sparsity = 0.0f; + + const BitsT* filter_data = nullptr; + + if (filter_type == cuvs::neighbors::filtering::FilterType::Bitmap) { + auto actual_filter = + dynamic_cast*>(filter); + filter_view.emplace(actual_filter->view()); + nnz_h = actual_filter->view().count(res); + sparsity = 1.0 - nnz_h / (1.0 * n_queries * n_dataset); + } else if (filter_type == cuvs::neighbors::filtering::FilterType::Bitset) { + auto actual_filter = + dynamic_cast*>(filter); + 
filter_view.emplace(actual_filter->view()); + nnz_h = n_queries * actual_filter->view().count(res); + sparsity = 1.0 - nnz_h / (1.0 * n_queries * n_dataset); + } else { + RAFT_FAIL("Unsupported sample filter type"); + } - raft::resource::sync_stream(res, stream); - float sparsity = (1.0f * nnz_h / (1.0f * n_queries * n_dataset)); + std::visit([&](const auto& actual_view) { filter_data = actual_view.data(); }, *filter_view); - if (sparsity > 0.01f) { + if (sparsity < 0.9f) { raft::resources stream_pool_handle(res); raft::resource::set_cuda_stream(stream_pool_handle, stream); auto idx_norm = idx.has_norms() ? const_cast(idx.norms().data_handle()) : nullptr; @@ -643,12 +663,12 @@ void brute_force_search_filtered( 0, idx_norm, nullptr, - filter.data()); + filter_data, + raft::identity_op(), + filter_type); } else { auto csr = raft::make_device_csr_matrix(res, n_queries, n_dataset, nnz_h); - - // fill csr - raft::sparse::convert::bitmap_to_csr(res, filter, csr); + std::visit([&](const auto& actual_view) { actual_view.to_csr(res, csr); }, *filter_view); // create filter csr view auto compressed_csr_view = csr.structure_view(); @@ -664,7 +684,11 @@ void brute_force_search_filtered( auto csr_view = raft::make_device_csr_matrix_view( csr.get_elements().data(), compressed_csr_view); - raft::sparse::linalg::masked_matmul(res, queries, dataset_view, filter, csr_view); + std::visit( + [&](const auto& actual_view) { + raft::sparse::linalg::masked_matmul(res, queries, dataset_view, actual_view, csr_view); + }, + *filter_view); // post process std::optional> query_norms_; @@ -733,21 +757,27 @@ void search(raft::resources const& res, return brute_force_search(res, idx, queries, neighbors, distances); } catch (const std::bad_cast&) { } + if constexpr (std::is_same_v) { + RAFT_FAIL("filtered search isn't available with col_major queries yet"); + } else { + try { + auto& sample_filter = + dynamic_cast&>( + sample_filter_ref); + return brute_force_search_filtered( + res, idx, queries, 
&sample_filter, neighbors, distances); + } catch (const std::bad_cast&) { + } - try { - auto& sample_filter = - dynamic_cast&>( - sample_filter_ref); - if constexpr (std::is_same_v) { - RAFT_FAIL("filtered search isn't available with col_major queries yet"); - } else { - cuvs::core::bitmap_view sample_filter_view = - sample_filter.bitmap_view_; + try { + auto& sample_filter = + dynamic_cast&>( + sample_filter_ref); return brute_force_search_filtered( - res, idx, queries, sample_filter_view, neighbors, distances); + res, idx, queries, &sample_filter, neighbors, distances); + } catch (const std::bad_cast&) { + RAFT_FAIL("Unsupported sample filter type"); } - } catch (const std::bad_cast&) { - RAFT_FAIL("Unsupported sample filter type"); } } diff --git a/cpp/src/neighbors/sample_filter.cuh b/cpp/src/neighbors/sample_filter.cuh index 258116ed3..b0c61f924 100644 --- a/cpp/src/neighbors/sample_filter.cuh +++ b/cpp/src/neighbors/sample_filter.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,6 +109,13 @@ inline _RAFT_HOST_DEVICE bool bitset_filter::operator()( return bitset_view_.test(sample_ix); } +template +template +void bitset_filter::to_csr(raft::resources const& handle, csr_matrix_t& csr) +{ + raft::sparse::convert::bitset_to_csr(handle, bitset_view_, csr); +} + template bitmap_filter::bitmap_filter( const cuvs::core::bitmap_view bitmap_for_filtering) @@ -124,4 +132,12 @@ inline _RAFT_HOST_DEVICE bool bitmap_filter::operator()( { return bitmap_view_.test(query_ix, sample_ix); } + +template +template +void bitmap_filter::to_csr(raft::resources const& handle, csr_matrix_t& csr) +{ + raft::sparse::convert::bitmap_to_csr(handle, bitmap_view_, csr); +} + } // namespace cuvs::neighbors::filtering diff --git a/cpp/tests/neighbors/brute_force_prefiltered.cu b/cpp/tests/neighbors/brute_force_prefiltered.cu index 12b1c529e..bf7dce7ee 100644 --- a/cpp/tests/neighbors/brute_force_prefiltered.cu +++ 
b/cpp/tests/neighbors/brute_force_prefiltered.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -146,11 +147,27 @@ void set_bitmap(const index_t* src, RAFT_CUDA_TRY(cudaGetLastError()); } +bool isCuSparseVersionGreaterThan_12_0_1() +{ + int version; + cusparseHandle_t handle; + cusparseCreate(&handle); + cusparseGetVersion(handle, &version); + + int major = version / 1000; + int minor = (version % 1000) / 100; + int patch = version % 100; + + cusparseDestroy(handle); + + return (major > 12) || (major == 12 && minor > 0) || (major == 12 && minor == 0 && patch >= 2); +} + template -class PrefilteredBruteForceTest +class PrefilteredBruteForceOnBitmapTest : public ::testing::TestWithParam> { public: - PrefilteredBruteForceTest() + PrefilteredBruteForceOnBitmapTest() : stream(raft::resource::get_cuda_stream(handle)), params(::testing::TestWithParam>::GetParam()), filter_d(0, stream), @@ -352,6 +369,9 @@ class PrefilteredBruteForceTest void SetUp() override { + if (std::is_same_v && !isCuSparseVersionGreaterThan_12_0_1()) { + GTEST_SKIP() << "Skipping all tests for half-float as cuSparse doesn't support it."; + } index_t element = raft::ceildiv(params.n_queries * params.n_dataset, index_t(sizeof(bitmap_t) * 8)); std::vector filter_h(element); @@ -476,8 +496,6 @@ class PrefilteredBruteForceTest out_val_expected_d.resize(params.n_queries * params.top_k, stream); out_idx_expected_d.resize(params.n_queries * params.top_k, stream); - // dump_vector(out_val_h.data(), out_val_h.size(), "out_val_h"); - raft::update_device(out_val_expected_d.data(), out_val_h.data(), out_val_h.size(), stream); raft::update_device(out_idx_expected_d.data(), out_idx_h.data(), out_idx_h.size(), stream); @@ -494,8 +512,8 @@ class PrefilteredBruteForceTest auto dataset = brute_force::build(handle, dataset_raw, params.metric); - auto filter = cuvs::core::bitmap_view( - (const bitmap_t*)filter_d.data(), params.n_queries, params.n_dataset); + auto filter = cuvs::core::bitmap_view( + 
(bitmap_t*)filter_d.data(), params.n_queries, params.n_dataset); auto out_val = raft::make_device_matrix_view( out_val_d.data(), params.n_queries, params.top_k); @@ -544,11 +562,451 @@ class PrefilteredBruteForceTest rmm::device_uvector out_idx_expected_d; }; -using PrefilteredBruteForceTest_float_int64 = PrefilteredBruteForceTest; -TEST_P(PrefilteredBruteForceTest_float_int64, Result) { Run(); } +template +class PrefilteredBruteForceOnBitsetTest + : public ::testing::TestWithParam> { + public: + PrefilteredBruteForceOnBitsetTest() + : stream(raft::resource::get_cuda_stream(handle)), + params(::testing::TestWithParam>::GetParam()), + filter_d(0, stream), + dataset_d(0, stream), + queries_d(0, stream), + out_val_d(0, stream), + out_val_expected_d(0, stream), + out_idx_d(0, stream), + out_idx_expected_d(0, stream) + { + } + + protected: + void repeat_cpu_bitset(std::vector& input, + size_t input_bits, + size_t repeat, + std::vector& output) + { + const size_t output_bits = input_bits * repeat; + const size_t output_units = (output_bits + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8); + + std::memset(output.data(), 0, output_units * sizeof(bitset_t)); + + size_t output_bit_index = 0; + + for (size_t r = 0; r < repeat; ++r) { + for (size_t i = 0; i < input_bits; ++i) { + size_t input_unit_index = i / (sizeof(bitset_t) * 8); + size_t input_bit_offset = i % (sizeof(bitset_t) * 8); + bool bit = (input[input_unit_index] >> input_bit_offset) & 1; + + size_t output_unit_index = output_bit_index / (sizeof(bitset_t) * 8); + size_t output_bit_offset = output_bit_index % (sizeof(bitset_t) * 8); + + output[output_unit_index] |= (static_cast(bit) << output_bit_offset); + + ++output_bit_index; + } + } + } + + index_t create_sparse_matrix_with_rmat(index_t m, + index_t n, + float sparsity, + rmm::device_uvector& filter_d) + { + index_t r_scale = (index_t)std::log2(m); + index_t c_scale = (index_t)std::log2(n); + index_t n_edges = (index_t)(m * n * 1.0f * sparsity); + index_t 
max_scale = std::max(r_scale, c_scale); + + rmm::device_uvector out_src{(unsigned long)n_edges, stream}; + rmm::device_uvector out_dst{(unsigned long)n_edges, stream}; + rmm::device_uvector theta{(unsigned long)(4 * max_scale), stream}; + + raft::random::RngState state{2024ULL, raft::random::GeneratorType::GenPC}; + + raft::random::uniform(handle, state, theta.data(), theta.size(), 0.0f, 1.0f); + normalize( + theta.data(), theta.data(), max_scale, r_scale, c_scale, r_scale != c_scale, true, stream); + raft::random::rmat_rectangular_gen((index_t*)nullptr, + out_src.data(), + out_dst.data(), + theta.data(), + r_scale, + c_scale, + n_edges, + stream, + state); + + index_t nnz_h = 0; + { + auto src = out_src.data(); + auto dst = out_dst.data(); + auto bitset = filter_d.data(); + rmm::device_scalar nnz(0, stream); + auto nnz_view = raft::make_device_scalar_view(nnz.data()); + auto filter_view = + raft::make_device_vector_view(filter_d.data(), filter_d.size()); + index_t size_h = m * n; + auto size_view = raft::make_host_scalar_view(&size_h); + + set_bitmap(src, dst, bitset, n_edges, n, stream); + + raft::popc(handle, filter_view, size_view, nnz_view); + raft::copy(&nnz_h, nnz.data(), 1, stream); + + raft::resource::sync_stream(handle, stream); + } + + return nnz_h; + } + + void cpu_convert_to_csr(std::vector& bitset, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + index_t index = 0; + bitset_t element = 0; + index_t bit_position = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + index = i * cols + j; + element = bitset[index / (8 * sizeof(bitset_t))]; + bit_position = index % (8 * sizeof(bitset_t)); + + if (((element >> bit_position) & 1)) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + void cpu_sddmm(const 
std::vector& A, + const std::vector& B, + std::vector& vals, + const std::vector& cols, + const std::vector& row_ptrs, + bool is_row_major_A, + bool is_row_major_B, + dist_t alpha = 1.0, + dist_t beta = 0.0) + { + if (params.n_queries * params.dim != static_cast(A.size()) || + params.dim * params.n_dataset != static_cast(B.size())) { + std::cerr << "Matrix dimensions and vector size do not match!" << std::endl; + return; + } -using PrefilteredBruteForceTest_half_int64 = PrefilteredBruteForceTest; -TEST_P(PrefilteredBruteForceTest_half_int64, Result) { Run(); } + bool trans_a = is_row_major_A; + bool trans_b = is_row_major_B; + + for (index_t i = 0; i < params.n_queries; ++i) { + for (index_t j = row_ptrs[i]; j < row_ptrs[i + 1]; ++j) { + dist_t sum = 0; + dist_t norms_A = 0; + dist_t norms_B = 0; + + for (index_t l = 0; l < params.dim; ++l) { + index_t a_index = trans_a ? i * params.dim + l : l * params.n_queries + i; + index_t b_index = trans_b ? l * params.n_dataset + cols[j] : cols[j] * params.dim + l; + dist_t A_v; + dist_t B_v; + if constexpr (sizeof(value_t) == 2) { + A_v = __half2float(__float2half(A[a_index])); + B_v = __half2float(__float2half(B[b_index])); + } else { + A_v = A[a_index]; + B_v = B[b_index]; + } + + sum += A_v * B_v; + + norms_A += A_v * A_v; + norms_B += B_v * B_v; + } + vals[j] = alpha * sum + beta * vals[j]; + if (params.metric == cuvs::distance::DistanceType::L2Expanded) { + vals[j] = dist_t(-2.0) * vals[j] + norms_A + norms_B; + } else if (params.metric == cuvs::distance::DistanceType::L2SqrtExpanded) { + vals[j] = std::sqrt(dist_t(-2.0) * vals[j] + norms_A + norms_B); + } else if (params.metric == cuvs::distance::DistanceType::CosineExpanded) { + vals[j] = dist_t(1.0) - vals[j] / std::sqrt(norms_A * norms_B); + } + } + } + } + + void cpu_select_k(const std::vector& indptr_h, + const std::vector& indices_h, + const std::vector& values_h, + std::optional>& in_idx_h, + index_t n_queries, + index_t n_dataset, + index_t top_k, + 
std::vector& out_values_h, + std::vector& out_indices_h, + bool select_min = true) + { + auto comp = [select_min](const std::pair& a, + const std::pair& b) { + return select_min ? a.first < b.first : a.first >= b.first; + }; + + for (index_t row = 0; row < n_queries; ++row) { + std::priority_queue, + std::vector>, + decltype(comp)> + pq(comp); + for (index_t idx = indptr_h[row]; idx < indptr_h[row + 1]; ++idx) { + pq.push({values_h[idx], (in_idx_h.has_value()) ? (*in_idx_h)[idx] : indices_h[idx]}); + if (pq.size() > size_t(top_k)) { pq.pop(); } + } + + std::vector> row_pairs; + while (!pq.empty()) { + row_pairs.push_back(pq.top()); + pq.pop(); + } + + if (select_min) { + std::sort(row_pairs.begin(), row_pairs.end(), [](const auto& a, const auto& b) { + return a.first <= b.first; + }); + } else { + std::sort(row_pairs.begin(), row_pairs.end(), [](const auto& a, const auto& b) { + return a.first >= b.first; + }); + } + for (index_t col = 0; col < top_k; col++) { + if (col < index_t(row_pairs.size())) { + out_values_h[row * top_k + col] = row_pairs[col].first; + out_indices_h[row * top_k + col] = row_pairs[col].second; + } + } + } + } + + void SetUp() override + { + if (std::is_same_v && !isCuSparseVersionGreaterThan_12_0_1()) { + GTEST_SKIP() << "Skipping all tests for half-float as cuSparse doesn't support it."; + } + index_t element = raft::ceildiv(1 * params.n_dataset, index_t(sizeof(bitset_t) * 8)); + std::vector filter_h(element); + std::vector filter_repeat_h(element * params.n_queries); + + filter_d.resize(element, stream); + + nnz = create_sparse_matrix_with_rmat(1, params.n_dataset, params.sparsity, filter_d); + raft::update_host(filter_h.data(), filter_d.data(), filter_d.size(), stream); + raft::resource::sync_stream(handle, stream); + + repeat_cpu_bitset( + filter_h, size_t(params.n_dataset), size_t(params.n_queries), filter_repeat_h); + nnz *= params.n_queries; + + index_t dataset_size = params.n_dataset * params.dim; + index_t queries_size = 
params.n_queries * params.dim; + + std::vector dataset_h(dataset_size); + std::vector queries_h(queries_size); + + dataset_d.resize(dataset_size, stream); + queries_d.resize(queries_size, stream); + + auto blobs_in_val = + raft::make_device_matrix(handle, 1, dataset_size + queries_size); + auto labels = raft::make_device_vector(handle, 1); + + if constexpr (!std::is_same_v) { + raft::random::make_blobs(blobs_in_val.data_handle(), + labels.data_handle(), + 1, + dataset_size + queries_size, + 1, + stream, + false, + nullptr, + nullptr, + value_t(1.0), + false, + value_t(-1.0f), + value_t(1.0f), + uint64_t(2024)); + } else { + raft::random::make_blobs(blobs_in_val.data_handle(), + labels.data_handle(), + 1, + dataset_size + queries_size, + 1, + stream, + false, + nullptr, + nullptr, + dist_t(1.0), + false, + dist_t(-1.0f), + dist_t(1.0f), + uint64_t(2024)); + } + + raft::copy(dataset_h.data(), blobs_in_val.data_handle(), dataset_size, stream); + + if constexpr (std::is_same_v) { + thrust::device_ptr d_output_ptr = + thrust::device_pointer_cast(blobs_in_val.data_handle()); + thrust::device_ptr d_value_ptr = thrust::device_pointer_cast(dataset_d.data()); + thrust::transform(thrust::cuda::par.on(stream), + d_output_ptr, + d_output_ptr + dataset_size, + d_value_ptr, + float_to_half()); + } else { + raft::copy(dataset_d.data(), blobs_in_val.data_handle(), dataset_size, stream); + } + + raft::copy(queries_h.data(), blobs_in_val.data_handle() + dataset_size, queries_size, stream); + if constexpr (std::is_same_v) { + thrust::device_ptr d_output_ptr = + thrust::device_pointer_cast(blobs_in_val.data_handle() + dataset_size); + thrust::device_ptr d_value_ptr = thrust::device_pointer_cast(queries_d.data()); + thrust::transform(thrust::cuda::par.on(stream), + d_output_ptr, + d_output_ptr + queries_size, + d_value_ptr, + float_to_half()); + } else { + raft::copy(queries_d.data(), blobs_in_val.data_handle() + dataset_size, queries_size, stream); + } + + 
raft::resource::sync_stream(handle); + + std::vector values_h(nnz); + std::vector indices_h(nnz); + std::vector indptr_h(params.n_queries + 1); + + cpu_convert_to_csr(filter_repeat_h, params.n_queries, params.n_dataset, indices_h, indptr_h); + + cpu_sddmm(queries_h, dataset_h, values_h, indices_h, indptr_h, true, false); + + bool select_min = cuvs::distance::is_min_close(params.metric); + + std::vector out_val_h( + params.n_queries * params.top_k, + select_min ? std::numeric_limits::infinity() : std::numeric_limits::lowest()); + std::vector out_idx_h(params.n_queries * params.top_k, static_cast(0)); + + out_val_d.resize(params.n_queries * params.top_k, stream); + out_idx_d.resize(params.n_queries * params.top_k, stream); + + raft::update_device(out_val_d.data(), out_val_h.data(), out_val_h.size(), stream); + raft::update_device(out_idx_d.data(), out_idx_h.data(), out_idx_h.size(), stream); + + raft::resource::sync_stream(handle); + + std::optional> optional_indices_h = std::nullopt; + cpu_select_k(indptr_h, + indices_h, + values_h, + optional_indices_h, + params.n_queries, + params.n_dataset, + params.top_k, + out_val_h, + out_idx_h, + select_min); + out_val_expected_d.resize(params.n_queries * params.top_k, stream); + out_idx_expected_d.resize(params.n_queries * params.top_k, stream); + + raft::update_device(out_val_expected_d.data(), out_val_h.data(), out_val_h.size(), stream); + raft::update_device(out_idx_expected_d.data(), out_idx_h.data(), out_idx_h.size(), stream); + + raft::resource::sync_stream(handle); + } + + void Run() + { + auto dataset_raw = raft::make_device_matrix_view( + (const value_t*)dataset_d.data(), params.n_dataset, params.dim); + + auto queries = raft::make_device_matrix_view( + (const value_t*)queries_d.data(), params.n_queries, params.dim); + + auto dataset = brute_force::build(handle, dataset_raw, params.metric); + + auto filter = + cuvs::core::bitset_view((bitset_t*)filter_d.data(), params.n_dataset); + + auto out_val = 
raft::make_device_matrix_view( + out_val_d.data(), params.n_queries, params.top_k); + auto out_idx = raft::make_device_matrix_view( + out_idx_d.data(), params.n_queries, params.top_k); + + brute_force::search(handle, + dataset, + queries, + out_idx, + out_val, + cuvs::neighbors::filtering::bitset_filter(filter)); + std::vector out_val_h(params.n_queries * params.top_k, + std::numeric_limits::infinity()); + + raft::update_host(out_val_h.data(), out_val_d.data(), out_val_h.size(), stream); + raft::resource::sync_stream(handle); + + ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(out_idx_expected_d.data(), + out_idx.data_handle(), + out_val_expected_d.data(), + out_val.data_handle(), + params.n_queries, + params.top_k, + 0.001f, + stream, + true)); + } + + protected: + raft::resources handle; + cudaStream_t stream; + + PrefilteredBruteForceInputs params; + + index_t nnz; + + rmm::device_uvector dataset_d; + rmm::device_uvector queries_d; + rmm::device_uvector filter_d; + + rmm::device_uvector out_val_d; + rmm::device_uvector out_val_expected_d; + + rmm::device_uvector out_idx_d; + rmm::device_uvector out_idx_expected_d; +}; + +using PrefilteredBruteForceTestOnBitmap_float_int64 = + PrefilteredBruteForceOnBitmapTest; +TEST_P(PrefilteredBruteForceTestOnBitmap_float_int64, Result) { Run(); } + +using PrefilteredBruteForceTestOnBitmap_half_int64 = + PrefilteredBruteForceOnBitmapTest; +TEST_P(PrefilteredBruteForceTestOnBitmap_half_int64, Result) { Run(); } + +using PrefilteredBruteForceTestOnBitset_float_int64 = + PrefilteredBruteForceOnBitsetTest; +TEST_P(PrefilteredBruteForceTestOnBitset_float_int64, Result) { Run(); } + +using PrefilteredBruteForceTestOnBitset_half_int64 = + PrefilteredBruteForceOnBitsetTest; +TEST_P(PrefilteredBruteForceTestOnBitset_half_int64, Result) { Run(); } template const std::vector> selectk_inputs = { @@ -570,7 +1028,7 @@ const std::vector> selectk_inputs = { {1024, 8192, 5, 0, 0.1, cuvs::distance::DistanceType::L2SqrtExpanded}, {1024, 8192, 8, 
0, 0.1, cuvs::distance::DistanceType::CosineExpanded}, - {1024, 8192, 1, 1, 0.1, cuvs::distance::DistanceType::L2Expanded}, //-- + {1024, 8192, 1, 1, 0.1, cuvs::distance::DistanceType::L2Expanded}, {1024, 8192, 3, 1, 0.1, cuvs::distance::DistanceType::InnerProduct}, {1024, 8192, 5, 1, 0.1, cuvs::distance::DistanceType::L2SqrtExpanded}, {1024, 8192, 8, 1, 0.1, cuvs::distance::DistanceType::CosineExpanded}, @@ -599,12 +1057,20 @@ const std::vector> selectk_inputs = { {1024, 8192, 5, 16, 0.5, cuvs::distance::DistanceType::CosineExpanded}, {1024, 8192, 8, 16, 0.2, cuvs::distance::DistanceType::CosineExpanded}}; -INSTANTIATE_TEST_CASE_P(PrefilteredBruteForceTest, - PrefilteredBruteForceTest_float_int64, +INSTANTIATE_TEST_CASE_P(PrefilteredBruteForceOnBitmapTest, + PrefilteredBruteForceTestOnBitmap_float_int64, + ::testing::ValuesIn(selectk_inputs)); + +INSTANTIATE_TEST_CASE_P(PrefilteredBruteForceOnBitmapTest, + PrefilteredBruteForceTestOnBitmap_half_int64, + ::testing::ValuesIn(selectk_inputs)); + +INSTANTIATE_TEST_CASE_P(PrefilteredBruteForceOnBitsetTest, + PrefilteredBruteForceTestOnBitset_float_int64, ::testing::ValuesIn(selectk_inputs)); -INSTANTIATE_TEST_CASE_P(PrefilteredBruteForceTest, - PrefilteredBruteForceTest_half_int64, +INSTANTIATE_TEST_CASE_P(PrefilteredBruteForceOnBitsetTest, + PrefilteredBruteForceTestOnBitset_half_int64, ::testing::ValuesIn(selectk_inputs)); } // namespace cuvs::neighbors::brute_force diff --git a/python/cuvs/cuvs/tests/test_brute_force.py b/python/cuvs/cuvs/tests/test_brute_force.py index 0b37ad885..a234794f9 100644 --- a/python/cuvs/cuvs/tests/test_brute_force.py +++ b/python/cuvs/cuvs/tests/test_brute_force.py @@ -134,7 +134,7 @@ def test_prefiltered_brute_force_knn( index = np.random.random_sample((n_index_rows, n_cols)).astype(dtype) queries = np.random.random_sample((n_query_rows, n_cols)).astype(dtype) bitmap = create_sparse_array( - (np.ceil(n_query_rows * n_index_rows / 32).astype(int)), sparsity + (np.ceil(n_query_rows * 
n_index_rows / 32).astype(np.uint32)), sparsity ) is_min = metric != "inner_product" From 8eca5247782abca479543ec69b1d479beb6009a6 Mon Sep 17 00:00:00 2001 From: Micka Date: Fri, 31 Jan 2025 05:52:55 +0100 Subject: [PATCH 4/4] Improve filtering documentation (#568) This PR adds a dedicated documentation page for filtering in the `Getting started` tab, and adds the `cuvs::neighbors::filtering` namespace to the C++ documentation Authors: - Micka (https://github.com/lowener) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/568 --- cpp/include/cuvs/neighbors/common.hpp | 7 ++ cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 2 +- docs/source/cpp_api/neighbors.rst | 1 + docs/source/cpp_api/neighbors_bruteforce.rst | 2 +- docs/source/cpp_api/neighbors_filter.rst | 18 +++ docs/source/filtering.rst | 116 +++++++++++++++++++ docs/source/getting_started.rst | 2 + docs/source/indexes/bruteforce.rst | 6 +- docs/source/indexes/cagra.rst | 24 ++-- docs/source/indexes/ivfflat.rst | 4 +- docs/source/indexes/ivfpq.rst | 12 +- examples/cpp/CMakeLists.txt | 4 + examples/cpp/src/brute_force_bitmap.cu | 84 ++++++++++++++ 13 files changed, 259 insertions(+), 23 deletions(-) create mode 100644 docs/source/cpp_api/neighbors_filter.rst create mode 100644 docs/source/filtering.rst create mode 100644 examples/cpp/src/brute_force_bitmap.cu diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 5dc99a4e8..038b6b1da 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -457,6 +457,11 @@ inline constexpr bool is_vpq_dataset_v = is_vpq_dataset::value; namespace filtering { +/** + * @defgroup neighbors_filtering Filtering for ANN Types + * @{ + */ + enum class FilterType { None, Bitmap, Bitset }; struct base_filter { @@ -567,6 +572,8 @@ struct bitset_filter : public base_filter { void to_csr(raft::resources const& handle, csr_matrix_t& csr); }; +/** @} */ // end group 
neighbors_filtering + /** * If the filtering depends on the index of a sample, then the following * filter template can be used: diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 44a1b11fa..0e492da6c 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1696,7 +1696,7 @@ auto build(raft::resources const& handle, "Unsupported data type"); std::cout << "using ivf_pq::index_params nrows " << (int)dataset.extent(0) << ", dim " - << (int)dataset.extent(1) << ", n_lits " << (int)params.n_lists << ", pq_dim " + << (int)dataset.extent(1) << ", n_lists " << (int)params.n_lists << ", pq_dim " << (int)params.pq_dim << std::endl; RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); RAFT_EXPECTS(n_rows >= params.n_lists, "number of rows can't be less than n_lists"); diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index 95359558f..ff1566e6c 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -12,6 +12,7 @@ Nearest Neighbors neighbors_bruteforce.rst neighbors_cagra.rst neighbors_dynamic_batching.rst + neighbors_filter.rst neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst diff --git a/docs/source/cpp_api/neighbors_bruteforce.rst b/docs/source/cpp_api/neighbors_bruteforce.rst index f75e26b3c..1a3f2f715 100644 --- a/docs/source/cpp_api/neighbors_bruteforce.rst +++ b/docs/source/cpp_api/neighbors_bruteforce.rst @@ -7,7 +7,7 @@ The bruteforce method is running the KNN algorithm. 
It performs an extensive sea :language: c++ :class: highlight -``#include `` +``#include `` namespace *cuvs::neighbors::bruteforce* diff --git a/docs/source/cpp_api/neighbors_filter.rst b/docs/source/cpp_api/neighbors_filter.rst new file mode 100644 index 000000000..aba1d348f --- /dev/null +++ b/docs/source/cpp_api/neighbors_filter.rst @@ -0,0 +1,18 @@ +Filtering +========== + +All nearest neighbors search methods support filtering. Filtering is a method to reduce the number +of candidates that are considered for the nearest neighbors search. + +.. role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors* + +.. doxygengroup:: neighbors_filtering + :project: cuvs + :members: + :content-only: diff --git a/docs/source/filtering.rst b/docs/source/filtering.rst new file mode 100644 index 000000000..35805c5de --- /dev/null +++ b/docs/source/filtering.rst @@ -0,0 +1,116 @@ +.. _filtering: + +~~~~~~~~~~~~~~~~~~~~~~~~ +Filtering vector indexes +~~~~~~~~~~~~~~~~~~~~~~~~ + +cuVS supports different types of filtering depending on the vector index being used. The main method used in all of the vector indexes +is pre-filtering, which is a technique that takes into account the filtering of the vectors before computing its closest neighbors, saving +some computation from calculating distances. + +Bitset +====== + +A bitset is an array of bits where each bit can have two possible values: `0` and `1`, which signify in the context of filtering whether +a sample should be filtered or not. `0` means that the corresponding vector will be filtered, and will therefore not be present in the results of the search. +This mechanism is optimized to take as little memory space as possible, and is available through the RAFT library +(check out RAFT's `bitset API documentation `). When calling a search function of an ANN index, the +bitset length should match the number of vectors present in the database. 
+ +Bitmap +====== + +A bitmap is based on the same principle as a bitset, but in two dimensions. This allows users to provide a different bitset for each query +being searched. Check out RAFT's `bitmap API documentation `. + +Examples +======== + +Using a Bitset filter on a CAGRA index +-------------------------------------- + +.. code-block:: c++ + + #include + #include + + using namespace cuvs::neighbors; + cagra::index index; + + // ... build index ... + + cagra::search_params search_params; + raft::device_resources res; + raft::device_matrix_view queries = load_queries(); + raft::device_matrix_view neighbors = make_device_matrix_view(n_queries, k); + raft::device_matrix_view distances = make_device_matrix_view(n_queries, k); + + // Load a list of all the samples that will get filtered + std::vector removed_indices_host = get_invalid_indices(); + auto removed_indices_device = + raft::make_device_vector(res, removed_indices_host.size()); + // Copy this list to device + raft::copy(removed_indices_device.data_handle(), removed_indices_host.data(), + removed_indices_host.size(), raft::resource::get_cuda_stream(res)); + + // Create a bitset with the list of samples to filter. + cuvs::core::bitset removed_indices_bitset( + res, removed_indices_device.view(), index.size()); + // Use a `bitset_filter` in the `cagra::search` function call. + auto bitset_filter = + cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset.view()); + cagra::search(res, + search_params, + index, + queries, + neighbors, + distances, + bitset_filter); + + +Using a Bitmap filter on a Brute-force index +-------------------------------------------- + +.. code-block:: c++ + + #include + #include + + using namespace cuvs::neighbors; + using indexing_dtype = int64_t; + + // ... build index ... 
+ brute_force::index_params index_params; + brute_force::search_params search_params; + raft::device_resources res; + raft::device_matrix_view dataset = load_dataset(n_vectors, dim); + raft::device_matrix_view queries = load_queries(n_queries, dim); + auto index = brute_force::build(res, index_params, raft::make_const_mdspan(dataset.view())); + + // Load a list of all the samples that will get filtered + std::vector removed_indices_host = get_invalid_indices(); + auto removed_indices_device = + raft::make_device_vector(res, removed_indices_host.size()); + // Copy this list to device + raft::copy(removed_indices_device.data_handle(), removed_indices_host.data(), + removed_indices_host.size(), raft::resource::get_cuda_stream(res)); + + // Create a bitmap with the list of samples to filter. + cuvs::core::bitset removed_indices_bitset( + res, removed_indices_device.view(), n_queries * n_vectors); + cuvs::core::bitmap_view removed_indices_bitmap( + removed_indices_bitset.data(), n_queries, n_vectors); + + // Use a `bitmap_filter` in the `brute_force::search` function call. + auto bitmap_filter = + cuvs::neighbors::filtering::bitmap_filter(removed_indices_bitmap); + + auto neighbors = raft::make_device_matrix_view(n_queries, k); + auto distances = raft::make_device_matrix_view(n_queries, k); + brute_force::search(res, + search_params, + index, + raft::make_const_mdspan(queries.view()), + neighbors.view(), + distances.view(), + bitmap_filter); diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index b9cfdaca2..c4706e510 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -118,3 +118,5 @@ We always welcome patches for new features and bug fixes. 
Please read our `contr indexes/indexes.rst api_basics.rst api_interoperability.rst + working_with_ann_indexes.rst + filtering.rst diff --git a/docs/source/indexes/bruteforce.rst b/docs/source/indexes/bruteforce.rst index 0bd17dbf1..3dc115507 100644 --- a/docs/source/indexes/bruteforce.rst +++ b/docs/source/indexes/bruteforce.rst @@ -12,7 +12,7 @@ Brute-force can also be a good choice for heavily filtered queries where other a when filtering out 90%-95% of the vectors from a search, the IVF methods could struggle to return anything at all with smaller number of probes and graph-based algorithms with limited hash table memory could end up skipping over important unfiltered entries. -[ :doc:`C API <../c_api/neighbors_bruteforce_c>` | :doc:`C++ API <../cpp_api/neighbors_bruteforce>` | :doc:`Python API <../python_api/neighbors_bruteforce>` | :doc:`Rust API <../rust_api/index>` ] +[ :doc:`C API <../c_api/neighbors_bruteforce_c>` | :doc:`C++ API <../cpp_api/neighbors_bruteforce>` | :doc:`Python API <../python_api/neighbors_brute_force>` | :doc:`Rust API <../rust_api/index>` ] Filtering considerations ------------------------ @@ -57,6 +57,6 @@ Memory footprint Index footprint ~~~~~~~~~~~~~~~ -Raw vectors: :math:`n_vectors * n_dimensions * precision` +Raw vectors: :math:`n\_vectors * n\_dimensions * precision` -Vector norms (for distances which require them): :math:`n_vectors * precision` +Vector norms (for distances which require them): :math:`n\_vectors * precision` diff --git a/docs/source/indexes/cagra.rst b/docs/source/indexes/cagra.rst index 551eca8f7..14d6c6502 100644 --- a/docs/source/indexes/cagra.rst +++ b/docs/source/indexes/cagra.rst @@ -108,14 +108,14 @@ IVFPQ or NN-DESCENT can be used to build the graph (additions to the peak memory Dataset on device (graph on host): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Index memory footprint (device): :math:`n_index_vectors * n_dims * sizeof(T)` +Index memory footprint (device): :math:`n\_index\_vectors * n\_dims * sizeof(T)` 
-Index memory footprint (host): :math:`graph_degree * n_index_vectors * sizeof(T)`` +Index memory footprint (host): :math:`graph\_degree * n\_index\_vectors * sizeof(T)`` Dataset on host (graph on host): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Index memory footprint (host): :math:`n_index_vectors * n_dims * sizeof(T) + graph_degree * n_index_vectors * sizeof(T)` +Index memory footprint (host): :math:`n\_index\_vectors * n\_dims * sizeof(T) + graph\_degree * n\_index\_vectors * sizeof(T)` Build peak memory usage: ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -123,7 +123,7 @@ Build peak memory usage: When built using NN-descent / IVF-PQ, the build process consists of two phases: (1) building an initial/(intermediate) graph and then (2) optimizing the graph. Key input parameters are n_vectors, intermediate_graph_degree, graph_degree. The memory usage in the first phase (building) depends on the chosen method. The biggest allocation is the graph (n_vectors*intermediate_graph_degree), but it’s stored in the host memory. Usually, the second phase (optimize) uses the most device memory. The peak memory usage is achieved during the pruning step (graph_core.cuh/optimize) -Optimize: formula for peak memory usage (device): :math:`n_vectors * (4 + (sizeof(IdxT) + 1) * intermediate_degree)`` +Optimize: formula for peak memory usage (device): :math:`n\_vectors * (4 + (sizeof(IdxT) + 1) * intermediate_degree)`` Build with out-of-core IVF-PQ peak memory usage: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -134,14 +134,18 @@ IVF-PQ Build: .. 
math:: - n_vectors / train_set_ratio * dim * sizeof(float) // trainset, may be in managed mem - + n_vectors / train_set_ratio * sizeof(uint32_t) // labels, may be in managed mem - + n_clusters * n_dim * sizeof(float) // cluster centers + n\_vectors / train\_set\_ratio * dim * sizeof_{float} // trainset, may be in managed mem + + + n\_vectors / train\_set\_ratio * sizeof(uint32_t) // labels, may be in managed mem + + + n\_clusters * n\_dim * sizeof_{float} // cluster centers IVF-PQ Search (max batch size 1024 vectors on device at a time): .. math:: - [n_vectors * (pq_dim * pq_bits / 8 + sizeof(int64_t)) + O(n_clusters)] - + [batch_size * n_dim * sizeof(float)] + [batch_size * intermediate_degree * sizeof(uint32_t)] + - [batch_size * intermediate_degree * sizeof(float)] + [n\_vectors * (pq\_dim * pq\_bits / 8 + sizeof_{int64\_t}) + O(n\_clusters)] + + + [batch\_size * n\_dim * sizeof_{float}] + [batch\_size * intermediate\_degree * sizeof_{uint32\_t}] + + + [batch\_size * intermediate\_degree * sizeof_{float}] diff --git a/docs/source/indexes/ivfflat.rst b/docs/source/indexes/ivfflat.rst index 89a9fb6e6..7154db037 100644 --- a/docs/source/indexes/ivfflat.rst +++ b/docs/source/indexes/ivfflat.rst @@ -86,7 +86,7 @@ Memory footprint ---------------- Each cluster is padded to at least 32 vectors (but potentially up to 1024). Assuming uniform random distribution of vectors/list, we would have -:math:`cluster\_overhead = (conservative\_memory\_allocation ? 16 : 512 ) * dim * sizeof_{float})` +:math:`cluster\_overhead = (conservative\_memory\_allocation ? 16 : 512 ) * dim * sizeof_{float}` Note that each cluster is allocated as a separate allocation. If we use a `cuda_memory_resource`, that would grab memory in 1 MiB chunks, so on average we might have 0.5 MiB overhead per cluster. If we us 10s of thousands of clusters, it becomes essential to use pool allocator to avoid this overhead. 
@@ -110,6 +110,6 @@ Index (device memory): Peak device memory usage for index build: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:math:`workspace = min(1GB, n\_queries * [(n\_lists + 1 + n\_probes * (k + 1)) * sizeof_{float}) + n\_probes * k * sizeof_{idx}])` +:math:`workspace = min(1GB, n\_queries * [(n\_lists + 1 + n\_probes * (k + 1)) * sizeof_{float} + n\_probes * k * sizeof_{idx}])` :math:`index\_size + workspace` diff --git a/docs/source/indexes/ivfpq.rst b/docs/source/indexes/ivfpq.rst index 0474452a0..ad973adf2 100644 --- a/docs/source/indexes/ivfpq.rst +++ b/docs/source/indexes/ivfpq.rst @@ -97,22 +97,22 @@ Simple approximate formula: :math:`n\_vectors * (pq\_dim * \frac{pq\_bits}{8} + The IVF lists end up being represented by a sparse data structure that stores the pointers to each list, an indices array that contains the indexes of each vector in each list, and an array with the encoded (and interleaved) data for each list. -IVF list pointers: :math:`n\_clusters * sizeof_{uint32_t}` +IVF list pointers: :math:`n\_clusters * sizeof_{uint32\_t}` -Indices: :math:`n\_vectors * sizeof_{idx}`` +Indices: :math:`n\_vectors * sizeof_{idx}` Encoded data (interleaved): :math:`n\_vectors * pq\_dim * \frac{pq\_bits}{8}` -Per subspace method: :math:`4 * pq\_dim * pq\_len * 2^pq\_bits` +Per subspace method: :math:`4 * pq\_dim * pq\_len * 2^{pq\_bits}` -Per cluster method: :math:`4 * n\_clusters * pq\_len * 2^pq\_bits` +Per cluster method: :math:`4 * n\_clusters * pq\_len * 2^{pq\_bits}` Extras: :math:`n\_clusters * (20 + 8 * dim)` Index (host memory): ~~~~~~~~~~~~~~~~~~~~ -When refinement is used with the dataset on host, the original raw vectors are needed: :math:`n\_vectors * dims * sizeof_{Tloat}` +When refinement is used with the dataset on host, the original raw vectors are needed: :math:`n\_vectors * dims * sizeof_{float}` Search peak memory usage (device); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -128,7 +128,7 @@ Build peak memory usage (device): 
\frac{n\_vectors}{trainset\_ratio * dims * sizeof_{float}} - + \frac{n\_vectors}{trainset\_ratio * sizeof_{uint32_t}} + + \frac{n\_vectors}{trainset\_ratio * sizeof_{uint32\_t}} + n\_clusters * dim * sizeof_{float} diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index b0d0ae9ee..6bf8f3408 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -36,6 +36,7 @@ set(BUILD_CUVS_C_LIBRARY OFF) include(../cmake/thirdparty/get_cuvs.cmake) # -------------- compile tasks ----------------- # +add_executable(BRUTE_FORCE_EXAMPLE src/brute_force_bitmap.cu) add_executable(CAGRA_EXAMPLE src/cagra_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) @@ -48,6 +49,9 @@ add_executable(VAMANA_EXAMPLE src/vamana_example.cu) add_library(rmm_logger OBJECT) target_link_libraries(rmm_logger PRIVATE rmm::rmm_logger_impl) +target_link_libraries( + BRUTE_FORCE_EXAMPLE PRIVATE cuvs::cuvs $ rmm_logger +) target_link_libraries( CAGRA_EXAMPLE PRIVATE cuvs::cuvs $ rmm_logger ) diff --git a/examples/cpp/src/brute_force_bitmap.cu b/examples/cpp/src/brute_force_bitmap.cu new file mode 100644 index 000000000..69e4df60b --- /dev/null +++ b/examples/cpp/src/brute_force_bitmap.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include + +void load_dataset(const raft::device_resources& res, float* data_ptr, int n_vectors, int dim) +{ + raft::random::RngState rng(1234ULL); + raft::random::uniform( + res, rng, data_ptr, n_vectors * dim, 0.1f, 2.0f); +} + +int main() +{ + using namespace cuvs::neighbors; + using dataset_dtype = float; + using indexing_dtype = int64_t; + auto dim = 128; + auto n_vectors = 90; + auto n_queries = 100; + auto k = 5; + + // ... build index ... + raft::device_resources res; + brute_force::index_params index_params; + brute_force::search_params search_params; + auto dataset = raft::make_device_matrix(res, n_vectors, dim); + auto queries = raft::make_device_matrix(res, n_queries, dim); + + load_dataset(res, dataset.data_handle(), n_vectors, dim); + load_dataset(res, queries.data_handle(), n_queries, dim); + auto index = brute_force::build(res, index_params, raft::make_const_mdspan(dataset.view())); + + // Load a list of all the samples that will get filtered + std::vector removed_indices_host = {2, 13, 21, 8}; + auto removed_indices_device = + raft::make_device_vector(res, removed_indices_host.size()); + // Copy this list to device + raft::copy(removed_indices_device.data_handle(), removed_indices_host.data(), + removed_indices_host.size(), raft::resource::get_cuda_stream(res)); + + // Create a bitmap with the list of samples to filter. + cuvs::core::bitset removed_indices_bitset( + res, removed_indices_device.view(), n_queries * n_vectors); + cuvs::core::bitmap_view removed_indices_bitmap( + removed_indices_bitset.data(), n_queries, n_vectors); + + // Use a `bitmap_filter` in the `brute_force::search` function call. + auto bitmap_filter = cuvs::neighbors::filtering::bitmap_filter(removed_indices_bitmap); + + auto neighbors = raft::make_device_matrix(res, n_queries, k); + auto distances = raft::make_device_matrix(res, n_queries, k); + std::cout << "Searching..." 
<< std::endl; + brute_force::search(res, + search_params, + index, + raft::make_const_mdspan(queries.view()), + neighbors.view(), + distances.view(), + bitmap_filter); + std::cout << "Success!" << std::endl; + return 0; +}