Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into rhdong/cagra-merge
Browse files Browse the repository at this point in the history
  • Loading branch information
rhdong authored Jan 31, 2025
2 parents 7af3ad8 + 8eca524 commit c69af18
Show file tree
Hide file tree
Showing 50 changed files with 1,750 additions and 508 deletions.
165 changes: 140 additions & 25 deletions cpp/include/cuvs/neighbors/brute_force.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,15 +332,28 @@ auto build(raft::resources const& handle,
* Note, this function requires a temporary buffer to store intermediate results between cuda kernel
* calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
* pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
* eliminate entirely allocations happening within `search`:
* eliminate entirely allocations happening within `search`.
*
* Usage example:
* @code{.cpp}
* ...
* // Use the same allocator across multiple searches to reduce the number of
* // cuda memory allocations
* brute_force::search(handle, index, queries1, out_inds1, out_dists1);
* brute_force::search(handle, index, queries2, out_inds2, out_dists2);
* brute_force::search(handle, index, queries3, out_inds3, out_dists3);
* ...
* using namespace cuvs::neighbors;
*
* // use default index parameters
* brute_force::index_params index_params;
* // create and fill the index from a [N, D] dataset
* auto index = brute_force::build(handle, index_params, dataset);
* // use default search parameters
* brute_force::search_params search_params;
* // create a bitset to filter the search
* auto removed_indices = raft::make_device_vector<int64_t, int64_t>(res, n_removed_indices);
* raft::core::bitset<std::uint32_t, int64_t> removed_indices_bitset(
* res, removed_indices.view(), dataset.extent(0));
* // search K nearest neighbours according to a bitset
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<float>(res, n_queries, k);
* auto filter = filtering::bitset_filter(removed_indices_bitset.view());
* brute_force::search(res, search_params, index, queries, neighbors, distances, filter);
* @endcode
*
* @param[in] handle
Expand All @@ -350,9 +363,17 @@ auto build(raft::resources const& handle,
* @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
* [n_queries, k]
* @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
* @param[in] sample_filter An optional device bitmap filter function with a `row-major` layout and
* the shape of [n_queries, index->size()], which means the filter will use the first
* `index->size()` bits to indicate whether queries[0] should compute the distance with dataset.
* @param[in] sample_filter An optional device filter that restricts which dataset elements should
* be considered for each query.
*
* - Supports two types of filters:
* 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element.
* All queries share the same filter, with a logical shape of `[1, index->size()]`.
* 2. **Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`,
* where each bit indicates whether a specific dataset element should be considered for a
* particular query. (1 for inclusion, 0 for exclusion).
*
* - The default value is `none_sample_filter`, which applies no filtering.
*/
void search(raft::resources const& handle,
const cuvs::neighbors::brute_force::search_params& params,
Expand All @@ -379,15 +400,28 @@ void search(raft::resources const& handle,
* Note, this function requires a temporary buffer to store intermediate results between cuda kernel
* calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
* pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
* eliminate entirely allocations happening within `search`:
* eliminate entirely allocations happening within `search`.
*
* Usage example:
* @code{.cpp}
* ...
* // Use the same allocator across multiple searches to reduce the number of
* // cuda memory allocations
* brute_force::search(handle, index, queries1, out_inds1, out_dists1);
* brute_force::search(handle, index, queries2, out_inds2, out_dists2);
* brute_force::search(handle, index, queries3, out_inds3, out_dists3);
* ...
* using namespace cuvs::neighbors;
*
* // use default index parameters
* brute_force::index_params index_params;
* // create and fill the index from a [N, D] dataset
* auto index = brute_force::build(handle, index_params, dataset);
* // use default search parameters
* brute_force::search_params search_params;
* // create a bitset to filter the search
* auto removed_indices = raft::make_device_vector<int64_t, int64_t>(res, n_removed_indices);
* raft::core::bitset<std::uint32_t, int64_t> removed_indices_bitset(
* res, removed_indices.view(), dataset.extent(0));
* // search K nearest neighbours according to a bitset
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<half>(res, n_queries, k);
* auto filter = filtering::bitset_filter(removed_indices_bitset.view());
* brute_force::search(res, search_params, index, queries, neighbors, distances, filter);
* @endcode
*
* @param[in] handle
Expand All @@ -397,8 +431,17 @@ void search(raft::resources const& handle,
* @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
* [n_queries, k]
* @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
* @param[in] sample_filter a optional device bitmap filter function that greenlights samples for a
* given
* @param[in] sample_filter An optional device filter that restricts which dataset elements should
* be considered for each query.
*
* - Supports two types of filters:
* 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element.
* All queries share the same filter, with a logical shape of `[1, index->size()]`.
* 2. **Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`,
* where each bit indicates whether a specific dataset element should be considered for a
* particular query. (1 for inclusion, 0 for exclusion).
*
* - The default value is `none_sample_filter`, which applies no filtering.
*/
void search(raft::resources const& handle,
const cuvs::neighbors::brute_force::search_params& params,
Expand All @@ -421,15 +464,51 @@ void search(raft::resources const& handle,
*
* See the [brute_force::build](#brute_force::build) documentation for a usage example.
*
* Note, this function requires a temporary buffer to store intermediate results between cuda kernel
* calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
* pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
* eliminate entirely allocations happening within `search`.
*
* Usage example:
* @code{.cpp}
* using namespace cuvs::neighbors;
*
* // use default index parameters
* brute_force::index_params index_params;
* // create and fill the index from a [N, D] dataset
* auto index = brute_force::build(handle, index_params, dataset);
* // use default search parameters
* brute_force::search_params search_params;
* // create a bitset to filter the search
* auto removed_indices = raft::make_device_vector<int64_t, int64_t>(res, n_removed_indices);
* raft::core::bitset<std::uint32_t, int64_t> removed_indices_bitset(
* res, removed_indices.view(), dataset.extent(0));
* // search K nearest neighbours according to a bitset
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<float>(res, n_queries, k);
* auto filter = filtering::bitset_filter(removed_indices_bitset.view());
* brute_force::search(res, search_params, index, queries, neighbors, distances, filter);
* @endcode
*
* @param[in] handle
* @param[in] params parameters configuring the search
* @param[in] index bruteforce constructed index
* @param[in] queries a device pointer to a col-major matrix [n_queries, index->dim()]
* @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
* [n_queries, k]
* @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
* @param[in] sample_filter an optional device bitmap filter function that greenlights samples for a
* given query
* @param[in] sample_filter An optional device filter that restricts which dataset elements should
* be considered for each query.
*
* - Supports two types of filters:
* 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element.
* All queries share the same filter, with a logical shape of `[1, index->size()]`.
* 2. **Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`,
* where each bit indicates whether a specific dataset element should be considered for a
* particular query. (1 for inclusion, 0 for exclusion).
*
* - The default value is `none_sample_filter`, which applies no filtering.
*/
void search(raft::resources const& handle,
const cuvs::neighbors::brute_force::search_params& params,
Expand All @@ -452,15 +531,51 @@ void search(raft::resources const& handle,
*
* See the [brute_force::build](#brute_force::build) documentation for a usage example.
*
* Note, this function requires a temporary buffer to store intermediate results between cuda kernel
* calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
* pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
* eliminate entirely allocations happening within `search`.
*
* Usage example:
* @code{.cpp}
* using namespace cuvs::neighbors;
*
* // use default index parameters
* brute_force::index_params index_params;
* // create and fill the index from a [N, D] dataset
* auto index = brute_force::build(handle, index_params, dataset);
* // use default search parameters
* brute_force::search_params search_params;
* // create a bitset to filter the search
* auto removed_indices = raft::make_device_vector<int64_t, int64_t>(res, n_removed_indices);
* raft::core::bitset<std::uint32_t, int64_t> removed_indices_bitset(
* res, removed_indices.view(), dataset.extent(0));
* // search K nearest neighbours according to a bitset
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<half>(res, n_queries, k);
* auto filter = filtering::bitset_filter(removed_indices_bitset.view());
* brute_force::search(res, search_params, index, queries, neighbors, distances, filter);
* @endcode
*
* @param[in] handle
* @param[in] params parameters configuring the search
* @param[in] index bruteforce constructed index
* @param[in] queries a device pointer to a col-major matrix [n_queries, index->dim()]
* @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
* [n_queries, k]
* @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
* @param[in] sample_filter an optional device bitmap filter function that greenlights samples for a
* given query
* @param[in] sample_filter An optional device filter that restricts which dataset elements should
* be considered for each query.
*
* - Supports two types of filters:
* 1. **Bitset Filter**: A shared filter where each bit corresponds to a dataset element.
* All queries share the same filter, with a logical shape of `[1, index->size()]`.
* 2. **Bitmap Filter**: A per-query filter with a logical shape of `[n_queries, index->size()]`,
* where each bit indicates whether a specific dataset element should be considered for a
* particular query. (1 for inclusion, 0 for exclusion).
*
* - The default value is `none_sample_filter`, which applies no filtering.
*/
void search(raft::resources const& handle,
const cuvs::neighbors::brute_force::search_params& params,
Expand Down
41 changes: 36 additions & 5 deletions cpp/include/cuvs/neighbors/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cstdint>
#include <cuvs/distance/distance.hpp>
#include <raft/core/device_csr_matrix.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdspan.hpp>
Expand Down Expand Up @@ -456,8 +457,16 @@ inline constexpr bool is_vpq_dataset_v = is_vpq_dataset<DatasetT>::value;

namespace filtering {

/**
* @defgroup neighbors_filtering Filtering for ANN Types
* @{
*/

/// Tag identifying the concrete kind of a sample filter; each filter struct in
/// this namespace reports one of these values from `get_filter_type()`.
enum class FilterType {
  None,    ///< reported by `none_sample_filter`
  Bitmap,  ///< reported by `bitmap_filter`
  Bitset   ///< reported by `bitset_filter`
};

struct base_filter {
virtual ~base_filter() = default;
virtual ~base_filter() = default;
virtual FilterType get_filter_type() const = 0;
};

/* A filter that filters nothing. This is the default behavior. */
Expand All @@ -475,6 +484,8 @@ struct none_sample_filter : public base_filter {
const uint32_t query_ix,
// the index of the current sample
const uint32_t sample_ix) const;

FilterType get_filter_type() const override { return FilterType::None; }
};

/**
Expand Down Expand Up @@ -513,15 +524,24 @@ struct ivf_to_sample_filter {
*/
template <typename bitmap_t, typename index_t>
struct bitmap_filter : public base_filter {
using view_t = cuvs::core::bitmap_view<bitmap_t, index_t>;

// View of the bitmap to use as a filter
const cuvs::core::bitmap_view<bitmap_t, index_t> bitmap_view_;
const view_t bitmap_view_;

bitmap_filter(const cuvs::core::bitmap_view<bitmap_t, index_t> bitmap_for_filtering);
bitmap_filter(const view_t bitmap_for_filtering);
inline _RAFT_HOST_DEVICE bool operator()(
// query index
const uint32_t query_ix,
// the index of the current sample
const uint32_t sample_ix) const;

FilterType get_filter_type() const override { return FilterType::Bitmap; }

view_t view() const { return bitmap_view_; }

template <typename csr_matrix_t>
void to_csr(raft::resources const& handle, csr_matrix_t& csr);
};

/**
Expand All @@ -532,17 +552,28 @@ struct bitmap_filter : public base_filter {
*/
template <typename bitset_t, typename index_t>
struct bitset_filter : public base_filter {
using view_t = cuvs::core::bitset_view<bitset_t, index_t>;

// View of the bitset to use as a filter
const cuvs::core::bitset_view<bitset_t, index_t> bitset_view_;
const view_t bitset_view_;

bitset_filter(const cuvs::core::bitset_view<bitset_t, index_t> bitset_for_filtering);
bitset_filter(const view_t bitset_for_filtering);
inline _RAFT_HOST_DEVICE bool operator()(
// query index
const uint32_t query_ix,
// the index of the current sample
const uint32_t sample_ix) const;

FilterType get_filter_type() const override { return FilterType::Bitset; }

view_t view() const { return bitset_view_; }

template <typename csr_matrix_t>
void to_csr(raft::resources const& handle, csr_matrix_t& csr);
};

/** @} */ // end group neighbors_filtering

/**
* If the filtering depends on the index of a sample, then the following
* filter template can be used:
Expand Down
22 changes: 11 additions & 11 deletions cpp/include/cuvs/neighbors/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ struct cuvsHnswIndexParams {
/** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
int ef_construction;
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive
to parallelism, and increasing the number of threads can reduce the quality of the index.
*/
When the value is 0, the number of threads is automatically set to the maximum
number of threads available.
*/
int num_threads;
};

Expand Down Expand Up @@ -158,8 +158,8 @@ cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params);
* NOTE: When hierarchy is:
* 1. `NONE`: This method uses the filesystem to write the CAGRA index in
* `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
* file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as
* the format is not compatible with the original hnswlib.
* file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS,
* as the format is not compatible with the original hnswlib.
* 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
* serialized index is also compatible with the original hnswlib library.
*
Expand Down Expand Up @@ -364,10 +364,10 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res,

/**
* @brief Serialize a CAGRA index to a file as an hnswlib index
* NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
* hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
* However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
* library.
* NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by
* the hnswlib wrapper in cuVS, as the serialization format is not compatible with the original
* hnswlib. However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the
* original hnswlib library.
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] filename the name of the file to save the index
Expand Down Expand Up @@ -406,8 +406,8 @@ cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHns
/**
* Load hnswlib index from file which was serialized from a HNSW index.
* NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable and can only be read by the
* hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
* Experimental, both the API and the serialization format are subject to change.
* hnswlib wrapper in cuVS, as the serialization format is not compatible with the original
* hnswlib. Experimental, both the API and the serialization format are subject to change.
*
* @code{.c}
* #include <cuvs/core/c_api.h>
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/cuvs/neighbors/hnsw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ struct index_params : cuvs::neighbors::index_params {
/** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
int ef_construction = 200;
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive
to parallelism, and increasing the number of threads can reduce the quality of the index.
When the value is 0, the number of threads is automatically set to the
maximum number of threads available.
*/
int num_threads = 2;
int num_threads = 0;
};

/**@}*/
Expand Down
Loading

0 comments on commit c69af18

Please sign in to comment.